From 06533017549285ac6e7581692263b664c8c5e698 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Thu, 15 Dec 2022 18:49:12 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20replace=20arrow2=20with=20official=20im?=
 =?UTF-8?q?plementation=20=F0=9F=8E=89=20(#753)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* chore: kick off. change datafusion/arrow/parquet to target version
  Signed-off-by: Ruihang Xia
* chore: replace one last datafusion dep
  Signed-off-by: Ruihang Xia
* feat: arrow_array switch to arrow
* chore: update dep of binary vector
* chore: fix wrong merge commit
  Signed-off-by: Ruihang Xia
* feat: Switch to datatypes2
* feat: Make recordbatch compile
* chore: sort Cargo.toml
* feat: Fix common::recordbatch compiler errors
* feat: Fix recordbatch test compiling issue
* fix: api crate (#708)
* fix: rename ConcreteDataType::timestamp_millis_type to
  ConcreteDataType::timestamp_millisecond_type. Fix other warnings regarding
  timestamp
* fix: revert changes in datatypes2
* fix: helper
* chore: delete datatypes based on arrow2
* feat: Fix some compiler errors in common::query (#710)
* feat: Fix some compiler errors in common::query
* feat: test_collect use vectors api
* fix: common-query subcrate (#712)
* fix: record batch adapter
  Signed-off-by: Ruihang Xia
* fix error enum
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* fix: Fix common::query compiler errors (#713)
* feat: Move conversion to ScalarValue to value.rs
* fix: Fix common::query compiler errors
  This commit also makes InnerError pub(crate).
* feat: Implements diff accumulator using WrapperType (#715)
* feat: Remove usage of opaque error from common::recordbatch
* feat: Remove opaque error from common::query
* feat: Fix diff compiler errors
  Now common_function just uses common_query's Error and Result. Adds a
  LargestType associated type to LogicalPrimitiveType to get the largest
  type a logical primitive type can cast to.
* feat: Remove LargestType from NativeType trait
* chore: Update comments
* feat: Restrict Scalar::RefType of WrapperType to itself
  Add trait bound `for<'a> Scalar<RefType<'a> = Self>` to WrapperType (this
  bound and LargestType are sketched in the code block below).
* chore: Address CR comments
* chore: Format codes
* fix: fix compile error for mean/polyval/pow/interp ops
  Signed-off-by: Ruihang Xia
* Revert "fix: fix compile error for mean/polyval/pow/interp ops"
  This reverts commit fb0b4eb826ffd034ffa760ba439bf220c9eca223.
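A note on the two trait changes above, since the prose is dense: below is a
minimal, self-contained Rust sketch of the shape they describe. It is not
GreptimeDB's actual code: `Scalar`, `RefType`, `WrapperType`,
`LogicalPrimitiveType`, and `LargestType` are names taken from the commit
message, while `as_scalar_ref`, `Int32Type`, `Int64Type`, and `first_or` are
illustrative stand-ins.

```rust
/// A scalar value whose borrowed form for lifetime 'a is `RefType<'a>`.
trait Scalar: 'static + Sized {
    type RefType<'a>
    where
        Self: 'a;

    fn as_scalar_ref(&self) -> Self::RefType<'_>;
}

/// The restriction described above: a wrapper type is a `Copy` scalar whose
/// borrowed form is itself, expressed as `for<'a> Scalar<RefType<'a> = Self>`.
trait WrapperType: Copy + for<'a> Scalar<RefType<'a> = Self> {}

/// A logical primitive type that knows the largest type it can cast to, so
/// that e.g. an accumulator over i32 values can widen its sum to i64.
trait LogicalPrimitiveType {
    type LargestType: LogicalPrimitiveType;
}

impl Scalar for i32 {
    type RefType<'a> = i32 where Self: 'a;

    fn as_scalar_ref(&self) -> i32 {
        *self
    }
}

impl WrapperType for i32 {}

#[allow(dead_code)]
struct Int32Type;
#[allow(dead_code)]
struct Int64Type;

impl LogicalPrimitiveType for Int32Type {
    type LargestType = Int64Type; // i32 values widen to i64
}

impl LogicalPrimitiveType for Int64Type {
    type LargestType = Int64Type; // already the largest
}

/// Generic code can take and return T by value, because T's borrowed
/// scalar form is T itself.
fn first_or<T: WrapperType>(values: &[T], default: T) -> T {
    values.first().copied().unwrap_or(default)
}

fn main() {
    assert_eq!(first_or(&[1i32, 2, 3], 0), 1);
}
```

The generic associated type `RefType<'a>` requires Rust 1.65 or later; the
`= Self` equality bound is what lets aggregate code such as the diff
accumulator mentioned above stay generic over primitive types.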
* fix: Fix compiler errors in argmax/rate/median/norm_cdf (#716)
* fix: Fix compiler errors in argmax/rate/median/norm_cdf
* chore: Address CR comments
* fix: fix compile error for mean/polyval/pow/interp ops (#717)
* fix: fix compile error for mean/polyval/pow/interp ops
  Signed-off-by: Ruihang Xia
* simplify type bounds
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* fix: fix argmin/percentile/clip/interp/scipy_stats_norm_pdf errors (#718)
  fix: fix argmin/percentile/clip/interp/scipy_stats_norm_pdf compiler errors
* fix: fix other compile errors in common-function (#719)
* further fixing
  Signed-off-by: Ruihang Xia
* fix all compile errors in common function
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* fix: Fix tests and clippy for common-function subcrate (#726)
* further fixing
  Signed-off-by: Ruihang Xia
* fix all compile errors in common function
  Signed-off-by: Ruihang Xia
* fix tests
  Signed-off-by: Ruihang Xia
* fix clippy
  Signed-off-by: Ruihang Xia
* revert test changes
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* fix: row group pruning (#725)
* fix: row group pruning
* chore: use macro to simplify stats implementation
* fix: CR comments
* fix: row group metadata length mismatch
* fix: simplify code
* fix: Fix common::grpc compiler errors (#722)
* fix: Fix common::grpc compiler errors
  This commit refactors RecordBatch and holds vectors in the RecordBatch
  struct, so we don't need to cast the array to a vector when serializing or
  iterating the batch. Now we use the vector API instead of the arrow API in
  the grpc crate (see the sketch below).
* chore: Address CR comments
* fix common record batch
  Signed-off-by: Ruihang Xia
* fix: Fix compile error in server subcrate (#727)
* fix: Fix compile error in server subcrate
  Signed-off-by: Ruihang Xia
* remove unused type alias
  Signed-off-by: Ruihang Xia
* explicitly panic
  Signed-off-by: Ruihang Xia
* Update src/storage/src/sst/parquet.rs
  Co-authored-by: Yingwen
  Signed-off-by: Ruihang Xia
  Co-authored-by: Yingwen
* fix: Fix common grpc expr (#730)
* fix compile errors
  Signed-off-by: Ruihang Xia
* rename fn names
  Signed-off-by: Ruihang Xia
* fix styles
  Signed-off-by: Ruihang Xia
* fix warnings in common-time
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* fix: pre-cast to avoid tremendous match arms (#734)
  Signed-off-by: Ruihang Xia
  Signed-off-by: Ruihang Xia
* feat: upgrade storage crate to arrow and parquet official impl (#738)
* fix: compile errors
* fix: parquet reader and writer
* fix: parquet reader and writer
* fix: WriteBatch IPC encode/decode
* fix: clippy errors in storage subcrate
* chore: remove suspicious unwrap
* fix: some CR comments
* fix: CR comments
* fix: CR comments
* fix: Fix compiler errors in catalog and mito crates (#742)
* fix: Fix compiler errors in mito
* fix: Fix compiler errors in catalog crate
* style: Fix clippy
* chore: Fix use
* Merge pull request #745
* fix nyc-taxi and util
* Merge branch 'replace-arrow2' into fix-others
* fix substrait
* fix warnings and errors in tests
* fix: Fix imports in optimizer.rs
* fix: errors in optimizer
* fix: remove unwrap
* fix: Fix compiler errors in query crate (#746)
* fix: Fix compiler errors in state.rs
* fix: fix compiler errors in state
* feat: upgrade sqlparser to 0.26
* fix: fix datafusion engine compiler errors
* fix: Fix some tests in query crate
* fix: Fix all warnings in tests
* feat: Remove `Type` from timestamp's type name
* fix: fix query tests
  Now datafusion already supports median, so this commit also removes the
  median function.
* style: Fix clippy
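The common::grpc entry above describes the one real API refactor in this
change list, so here is a rough sketch of the shape it implies. This is not
the real common_recordbatch API: `RecordBatch` holding typed vectors and the
`column_by_name` accessor come from the change log, while `Schema`, `Vector`,
`Int64Vector`, and all field names are assumed for illustration.

```rust
use std::sync::Arc;

/// Stand-in for the schema type; only column names matter for this sketch.
struct Schema {
    column_names: Vec<String>,
}

/// Stand-in for the typed vector abstraction.
trait Vector {
    fn len(&self) -> usize;
}

struct Int64Vector(Vec<i64>);

impl Vector for Int64Vector {
    fn len(&self) -> usize {
        self.0.len()
    }
}

/// The batch keeps typed vectors next to the schema, so serializing or
/// iterating never has to downcast an Arrow array back into a vector.
struct RecordBatch {
    schema: Arc<Schema>,
    columns: Vec<Arc<dyn Vector>>,
}

impl RecordBatch {
    fn num_rows(&self) -> usize {
        self.columns.first().map(|c| c.len()).unwrap_or(0)
    }

    /// Analogue of the `column_by_name` accessor added later in this log.
    fn column_by_name(&self, name: &str) -> Option<&Arc<dyn Vector>> {
        let idx = self.schema.column_names.iter().position(|n| n == name)?;
        self.columns.get(idx)
    }
}

fn main() {
    let batch = RecordBatch {
        schema: Arc::new(Schema {
            column_names: vec!["ts".to_string()],
        }),
        columns: vec![Arc::new(Int64Vector(vec![1, 2, 3])) as Arc<dyn Vector>],
    };
    assert_eq!(batch.num_rows(), 3);
    assert!(batch.column_by_name("ts").is_some());
}
```

With this layout the type information established at construction is reused
on every read, which is the point the commit message makes about
serialization and iteration.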
* feat: Remove RecordBatch::pretty_print
* chore: Address CR comments
* Update src/query/src/query_engine/state.rs
  Co-authored-by: Ruihang Xia
* fix: frontend compile errors (#747)
  fix: fix compile errors in frontend
* fix: Fix compiler errors in script crate (#749)
* fix: Fix compiler errors in state.rs
* fix: fix compiler errors in state
* feat: upgrade sqlparser to 0.26
* fix: fix datafusion engine compiler errors
* fix: Fix some tests in query crate
* fix: Fix all warnings in tests
* feat: Remove `Type` from timestamp's type name
* fix: fix query tests
  Now datafusion already supports median, so this commit also removes the
  median function.
* style: Fix clippy
* feat: Remove RecordBatch::pretty_print
* chore: Address CR comments
* feat: Add column_by_name to RecordBatch
* feat: modify select_from_rb
* feat: Fix some compiler errors in vector.rs
* feat: Fix more compiler errors in vector.rs
* fix: fix table.rs
  Signed-off-by: Ruihang Xia
* fix: Fix compiler errors in coprocessor
* fix: Fix some compiler errors
* fix: Fix compiler errors in script
* chore: Remove unused imports and format code
* test: disable interval tests
* test: Fix test_compile_execute test
* style: Fix clippy
* feat: Support interval
* feat: Add RecordBatch::columns and fix clippy
  Signed-off-by: Ruihang Xia
  Co-authored-by: Ruihang Xia
* fix: Fix All The Tests! (#752)
* fix: Fix several test compile errors
  Signed-off-by: Ruihang Xia
* fix: some compile errors in tests
  Signed-off-by: Ruihang Xia
* fix: compile errors in frontend tests
* fix: compile errors in frontend tests
* test: Fix tests in api and common-query
* test: Fix test in sql crate
* fix: resolve substrait error
  Signed-off-by: Ruihang Xia
* chore: add more tests
* test: Fix tests in servers
* fix instance_test
  Signed-off-by: Ruihang Xia
* test: Fix tests in tests-integration
  Signed-off-by: Ruihang Xia
  Co-authored-by: Lei, HUANG
  Co-authored-by: evenyag
* fix: clippy errors
  Signed-off-by: Ruihang Xia
  Co-authored-by: Ruihang Xia
  Co-authored-by: evenyag
---
 Cargo.lock | 1562 +++++++++-- Cargo.toml | 1 - benchmarks/Cargo.toml | 4 +- benchmarks/src/bin/nyc-taxi.rs | 22 +- src/api/greptime/v1/column.proto | 10 +- src/api/src/helper.rs | 104 +- src/catalog/Cargo.toml | 4 +- src/catalog/src/error.rs | 23 +- src/catalog/src/helper.rs | 2 +- src/catalog/src/local/manager.rs | 33 +- src/catalog/src/system.rs | 25 +- src/catalog/src/tables.rs | 94 +- src/client/Cargo.toml | 4 +- src/client/examples/logical.rs | 2 +- src/client/src/database.rs | 5 +- src/cmd/src/frontend.rs | 6 +- src/cmd/src/standalone.rs | 6 +- src/common/function/Cargo.toml | 2 +- src/common/function/src/error.rs | 69 - src/common/function/src/lib.rs | 1 - src/common/function/src/scalars.rs | 1 - .../{aggregate/mod.rs => aggregate.rs} | 3 - .../function/src/scalars/aggregate/argmax.rs | 39 +- .../function/src/scalars/aggregate/argmin.rs | 38 +- .../function/src/scalars/aggregate/diff.rs | 75 +- .../function/src/scalars/aggregate/mean.rs | 39 +- .../function/src/scalars/aggregate/median.rs | 289 --- .../src/scalars/aggregate/percentile.rs | 108 +- .../function/src/scalars/aggregate/polyval.rs | 83 +- .../scalars/aggregate/scipy_stats_norm_cdf.rs | 61 +- .../scalars/aggregate/scipy_stats_norm_pdf.rs | 62 +- .../{expression/mod.rs => expression.rs} | 0 .../function/src/scalars/expression/binary.rs | 32 +- .../function/src/scalars/expression/ctx.rs | 3 +- .../function/src/scalars/expression/unary.rs | 5 +- src/common/function/src/scalars/function.rs | 3 +- .../src/scalars/{math/mod.rs => math.rs} | 0
src/common/function/src/scalars/math/pow.rs | 5 +- src/common/function/src/scalars/math/rate.rs | 46 +- .../src/scalars/{numpy/mod.rs => numpy.rs} | 1 - src/common/function/src/scalars/numpy/clip.rs | 99 +- .../function/src/scalars/numpy/interp.rs | 119 +- src/common/function/src/scalars/test.rs | 2 +- .../{timestamp/mod.rs => timestamp.rs} | 0 .../src/scalars/timestamp/from_unixtime.rs | 36 +- src/common/function/src/scalars/udf.rs | 5 +- src/common/grpc-expr/src/insert.rs | 104 +- src/common/grpc/Cargo.toml | 4 +- src/common/grpc/src/select.rs | 219 ++- src/common/grpc/src/writer.rs | 52 +- src/common/query/Cargo.toml | 8 +- src/common/query/src/error.rs | 142 +- .../{logical_plan/mod.rs => logical_plan.rs} | 6 +- .../query/src/logical_plan/accumulator.rs | 351 +--- src/common/query/src/logical_plan/expr.rs | 4 +- src/common/query/src/logical_plan/udaf.rs | 2 +- src/common/query/src/physical_plan.rs | 84 +- src/common/query/src/signature.rs | 2 +- src/common/recordbatch/Cargo.toml | 6 +- src/common/recordbatch/src/adapter.rs | 28 +- src/common/recordbatch/src/error.rs | 30 +- src/common/recordbatch/src/lib.rs | 28 +- src/common/recordbatch/src/recordbatch.rs | 154 +- src/common/recordbatch/src/util.rs | 50 +- src/common/substrait/Cargo.toml | 6 +- src/common/substrait/src/context.rs | 2 +- src/common/substrait/src/df_expr.rs | 50 +- src/common/substrait/src/df_logical.rs | 47 +- src/common/time/src/date.rs | 9 +- src/common/time/src/datetime.rs | 7 +- src/common/time/src/timestamp.rs | 97 +- src/common/time/src/util.rs | 4 +- src/datanode/Cargo.toml | 9 +- src/datanode/src/server/grpc.rs | 10 +- src/datanode/src/sql.rs | 12 +- src/datanode/src/sql/create.rs | 2 +- src/datanode/src/sql/insert.rs | 16 +- src/datanode/src/tests/instance_test.rs | 236 ++- src/datatypes/Cargo.toml | 18 +- src/datatypes/src/arrow_array.rs | 217 +-- src/datatypes/src/data_type.rs | 234 ++- src/datatypes/src/error.rs | 6 + src/datatypes/src/lib.rs | 3 +- src/datatypes/src/macros.rs | 48 +- src/datatypes/src/prelude.rs | 6 +- src/datatypes/src/scalars.rs | 99 +- src/datatypes/src/schema.rs | 304 +--- .../src/schema/column_schema.rs | 0 src/datatypes/src/schema/constraint.rs | 16 +- src/datatypes/src/schema/raw.rs | 10 +- .../src/timestamp.rs | 18 + src/datatypes/src/type_id.rs | 14 +- src/datatypes/src/types.rs | 22 +- src/datatypes/src/types/binary_type.rs | 4 + src/datatypes/src/types/boolean_type.rs | 4 + src/datatypes/src/types/date.rs | 54 - .../src/types/date_type.rs | 1 + src/datatypes/src/types/datetime.rs | 61 - .../src/types/datetime_type.rs | 1 + src/datatypes/src/types/list_type.rs | 34 +- src/datatypes/src/types/null_type.rs | 6 +- src/datatypes/src/types/primitive_traits.rs | 138 -- src/datatypes/src/types/primitive_type.rs | 389 ++-- src/datatypes/src/types/string_type.rs | 11 +- src/datatypes/src/types/timestamp.rs | 125 -- .../src/types/timestamp_type.rs | 44 +- src/datatypes/src/value.rs | 484 +++-- src/datatypes/src/vectors.rs | 184 +- src/datatypes/src/vectors/binary.rs | 77 +- src/datatypes/src/vectors/boolean.rs | 69 +- src/datatypes/src/vectors/builder.rs | 494 ------ src/datatypes/src/vectors/constant.rs | 64 +- src/datatypes/src/vectors/date.rs | 282 +-- src/datatypes/src/vectors/datetime.rs | 278 +-- src/datatypes/src/vectors/eq.rs | 50 +- src/datatypes/src/vectors/helper.rs | 221 ++- src/datatypes/src/vectors/list.rs | 490 ++++-- src/datatypes/src/vectors/mutable.rs | 54 - src/datatypes/src/vectors/null.rs | 29 +- src/datatypes/src/vectors/operations.rs | 46 +- 
.../src/vectors/operations/filter.rs | 29 +- .../src/vectors/operations/find_unique.rs | 18 +- .../src/vectors/operations/replicate.rs | 37 +- src/datatypes/src/vectors/primitive.rs | 388 ++-- src/datatypes/src/vectors/string.rs | 243 +-- src/datatypes/src/vectors/timestamp.rs | 312 +--- .../src/vectors/validity.rs | 0 src/datatypes2/Cargo.toml | 24 - src/datatypes2/src/arrow_array.rs | 242 --- src/datatypes2/src/data_type.rs | 486 ----- src/datatypes2/src/error.rs | 144 -- src/datatypes2/src/lib.rs | 33 - src/datatypes2/src/macros.rs | 68 - src/datatypes2/src/prelude.rs | 20 - src/datatypes2/src/scalars.rs | 443 ----- src/datatypes2/src/schema.rs | 430 ----- src/datatypes2/src/schema/constraint.rs | 306 ---- src/datatypes2/src/schema/raw.rs | 77 - src/datatypes2/src/serialize.rs | 20 - src/datatypes2/src/type_id.rs | 93 - src/datatypes2/src/types.rs | 37 - src/datatypes2/src/types/binary_type.rs | 60 - src/datatypes2/src/types/boolean_type.rs | 59 - src/datatypes2/src/types/list_type.rs | 95 - src/datatypes2/src/types/null_type.rs | 58 - src/datatypes2/src/types/primitive_type.rs | 358 ---- src/datatypes2/src/types/string_type.rs | 60 - src/datatypes2/src/value.rs | 1275 -------------- src/datatypes2/src/vectors.rs | 309 ---- src/datatypes2/src/vectors/binary.rs | 353 ---- src/datatypes2/src/vectors/boolean.rs | 371 ---- src/datatypes2/src/vectors/constant.rs | 218 --- src/datatypes2/src/vectors/date.rs | 103 -- src/datatypes2/src/vectors/datetime.rs | 116 -- src/datatypes2/src/vectors/eq.rs | 228 --- src/datatypes2/src/vectors/helper.rs | 431 ----- src/datatypes2/src/vectors/list.rs | 747 -------- src/datatypes2/src/vectors/null.rs | 282 --- src/datatypes2/src/vectors/operations.rs | 127 -- .../src/vectors/operations/filter.rs | 145 -- .../src/vectors/operations/find_unique.rs | 367 ---- .../src/vectors/operations/replicate.rs | 170 -- src/datatypes2/src/vectors/primitive.rs | 552 ------ src/datatypes2/src/vectors/string.rs | 370 ---- src/datatypes2/src/vectors/timestamp.rs | 31 - src/frontend/Cargo.toml | 9 +- src/frontend/src/error.rs | 13 + src/frontend/src/expr_factory.rs | 2 +- src/frontend/src/instance.rs | 71 +- src/frontend/src/instance/distributed.rs | 30 +- src/frontend/src/instance/opentsdb.rs | 20 +- src/frontend/src/mysql.rs | 6 +- src/frontend/src/postgres.rs | 4 +- src/frontend/src/spliter.rs | 109 +- src/frontend/src/sql.rs | 21 +- src/frontend/src/table.rs | 49 +- src/frontend/src/table/insert.rs | 27 +- src/frontend/src/table/scan.rs | 13 +- src/mito/Cargo.toml | 6 +- src/mito/src/engine.rs | 166 +- src/mito/src/manifest/action.rs | 6 +- src/mito/src/table.rs | 5 +- src/mito/src/table/test_util/mock_engine.rs | 9 +- src/query/Cargo.toml | 11 +- src/query/src/datafusion.rs | 62 +- src/query/src/datafusion/planner.rs | 30 +- src/query/src/expr.rs | 13 - src/query/src/optimizer.rs | 63 +- src/query/src/plan.rs | 2 +- src/query/src/query_engine/state.rs | 114 +- src/query/src/sql.rs | 9 +- src/query/tests/argmax_test.rs | 34 +- src/query/tests/argmin_test.rs | 33 +- src/query/tests/function.rs | 30 +- src/query/tests/mean_test.rs | 27 +- src/query/tests/my_sum_udaf_example.rs | 107 +- src/query/tests/percentile_test.rs | 31 +- src/query/tests/polyval_test.rs | 39 +- src/query/tests/pow.rs | 6 +- src/query/tests/query_engine_test.rs | 257 +-- src/query/tests/scipy_stats_norm_cdf_test.rs | 28 +- src/query/tests/scipy_stats_norm_pdf.rs | 28 +- src/script/Cargo.toml | 9 +- src/script/src/python/builtins/mod.rs | 183 +- src/script/src/python/builtins/test.rs | 89 +- 
src/script/src/python/coprocessor.rs | 214 +-- src/script/src/python/engine.rs | 47 +- src/script/src/python/error.rs | 10 +- src/script/src/python/test.rs | 113 +- src/script/src/python/utils.rs | 22 +- src/script/src/python/vector.rs | 346 ++-- src/script/src/table.rs | 39 +- src/servers/src/http.rs | 2 +- src/servers/src/http/influxdb.rs | 24 +- src/servers/src/influxdb.rs | 10 +- src/servers/src/line_writer.rs | 49 +- src/servers/src/mysql/federated.rs | 99 +- src/servers/src/mysql/server.rs | 4 +- src/servers/src/opentsdb/codec.rs | 8 +- src/servers/src/postgres/handler.rs | 2 +- src/servers/src/postgres/server.rs | 4 +- src/servers/src/prometheus.rs | 18 +- src/servers/tests/mysql/mysql_server_test.rs | 22 +- src/servers/tests/postgres/mod.rs | 20 +- src/sql/Cargo.toml | 2 +- src/sql/src/ast.rs | 2 +- src/sql/src/parser.rs | 19 +- src/sql/src/parsers/create_parser.rs | 2 +- src/sql/src/statements.rs | 37 +- src/sql/src/statements/insert.rs | 2 +- src/storage/Cargo.toml | 3 +- src/storage/benches/memtable/mod.rs | 10 +- .../benches/memtable/util/regiondesc_util.rs | 2 +- src/storage/benches/wal/util/mod.rs | 6 +- src/storage/proto/write_batch.proto | 14 +- src/storage/src/error.rs | 20 +- src/storage/src/manifest/action.rs | 10 +- src/storage/src/memtable/btree.rs | 14 +- src/storage/src/memtable/inserter.rs | 15 +- src/storage/src/memtable/tests.rs | 49 +- src/storage/src/metadata.rs | 31 +- src/storage/src/proto/write_batch.rs | 88 +- src/storage/src/read/merge.rs | 6 +- src/storage/src/region/tests.rs | 39 +- src/storage/src/region/tests/alter.rs | 23 +- src/storage/src/region/tests/projection.rs | 14 +- src/storage/src/schema.rs | 7 +- src/storage/src/schema/compat.rs | 137 +- src/storage/src/schema/projected.rs | 7 +- src/storage/src/schema/region.rs | 4 +- src/storage/src/schema/store.rs | 50 +- src/storage/src/sst.rs | 2 +- src/storage/src/sst/parquet.rs | 387 ++-- src/storage/src/test_util/descriptor_util.rs | 2 +- src/storage/src/test_util/read_util.rs | 24 +- src/storage/src/write_batch.rs | 94 +- src/storage/src/write_batch/compat.rs | 12 +- src/store-api/src/storage/chunk.rs | 1 + src/store-api/src/storage/descriptors.rs | 8 +- src/table/Cargo.toml | 10 +- src/table/src/error.rs | 6 +- src/table/src/metadata.rs | 24 +- src/table/src/predicate.rs | 116 +- src/table/src/predicate/stats.rs | 148 +- src/table/src/table/adapter.rs | 9 +- src/table/src/table/numbers.rs | 11 +- src/table/src/table/scan.rs | 14 +- src/table/src/test_util/memtable.rs | 41 +- tests-integration/src/test_util.rs | 4 +- tests-integration/tests/grpc.rs | 6 +- tests-integration/tests/http.rs | 6 +- tests/runner/src/util.rs | 19 +- 272 files changed, 6718 insertions(+), 17621 deletions(-) delete mode 100644 src/common/function/src/error.rs rename src/common/function/src/scalars/{aggregate/mod.rs => aggregate.rs} (96%) delete mode 100644 src/common/function/src/scalars/aggregate/median.rs rename src/common/function/src/scalars/{expression/mod.rs => expression.rs} (100%) rename src/common/function/src/scalars/{math/mod.rs => math.rs} (100%) rename src/common/function/src/scalars/{numpy/mod.rs => numpy.rs} (98%) rename src/common/function/src/scalars/{timestamp/mod.rs => timestamp.rs} (100%) rename src/common/query/src/{logical_plan/mod.rs => logical_plan.rs} (97%) rename src/{datatypes2 => datatypes}/src/schema/column_schema.rs (100%) rename src/{datatypes2 => datatypes}/src/timestamp.rs (89%) delete mode 100644 src/datatypes/src/types/date.rs rename src/{datatypes2 => datatypes}/src/types/date_type.rs 
(98%) delete mode 100644 src/datatypes/src/types/datetime.rs rename src/{datatypes2 => datatypes}/src/types/datetime_type.rs (98%) delete mode 100644 src/datatypes/src/types/primitive_traits.rs delete mode 100644 src/datatypes/src/types/timestamp.rs rename src/{datatypes2 => datatypes}/src/types/timestamp_type.rs (81%) delete mode 100644 src/datatypes/src/vectors/builder.rs delete mode 100644 src/datatypes/src/vectors/mutable.rs rename src/{datatypes2 => datatypes}/src/vectors/validity.rs (100%) delete mode 100644 src/datatypes2/Cargo.toml delete mode 100644 src/datatypes2/src/arrow_array.rs delete mode 100644 src/datatypes2/src/data_type.rs delete mode 100644 src/datatypes2/src/error.rs delete mode 100644 src/datatypes2/src/lib.rs delete mode 100644 src/datatypes2/src/macros.rs delete mode 100644 src/datatypes2/src/prelude.rs delete mode 100644 src/datatypes2/src/scalars.rs delete mode 100644 src/datatypes2/src/schema.rs delete mode 100644 src/datatypes2/src/schema/constraint.rs delete mode 100644 src/datatypes2/src/schema/raw.rs delete mode 100644 src/datatypes2/src/serialize.rs delete mode 100644 src/datatypes2/src/type_id.rs delete mode 100644 src/datatypes2/src/types.rs delete mode 100644 src/datatypes2/src/types/binary_type.rs delete mode 100644 src/datatypes2/src/types/boolean_type.rs delete mode 100644 src/datatypes2/src/types/list_type.rs delete mode 100644 src/datatypes2/src/types/null_type.rs delete mode 100644 src/datatypes2/src/types/primitive_type.rs delete mode 100644 src/datatypes2/src/types/string_type.rs delete mode 100644 src/datatypes2/src/value.rs delete mode 100644 src/datatypes2/src/vectors.rs delete mode 100644 src/datatypes2/src/vectors/binary.rs delete mode 100644 src/datatypes2/src/vectors/boolean.rs delete mode 100644 src/datatypes2/src/vectors/constant.rs delete mode 100644 src/datatypes2/src/vectors/date.rs delete mode 100644 src/datatypes2/src/vectors/datetime.rs delete mode 100644 src/datatypes2/src/vectors/eq.rs delete mode 100644 src/datatypes2/src/vectors/helper.rs delete mode 100644 src/datatypes2/src/vectors/list.rs delete mode 100644 src/datatypes2/src/vectors/null.rs delete mode 100644 src/datatypes2/src/vectors/operations.rs delete mode 100644 src/datatypes2/src/vectors/operations/filter.rs delete mode 100644 src/datatypes2/src/vectors/operations/find_unique.rs delete mode 100644 src/datatypes2/src/vectors/operations/replicate.rs delete mode 100644 src/datatypes2/src/vectors/primitive.rs delete mode 100644 src/datatypes2/src/vectors/string.rs delete mode 100644 src/datatypes2/src/vectors/timestamp.rs delete mode 100644 src/query/src/expr.rs diff --git a/Cargo.lock b/Cargo.lock index 86c5827f0f..7404b2a25f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,7 +35,7 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" dependencies = [ - "getrandom 0.2.7", + "getrandom 0.2.8", "once_cell", "version_check", ] @@ -48,16 +48,16 @@ checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107" dependencies = [ "cfg-if 1.0.0", "const-random", - "getrandom 0.2.7", + "getrandom 0.2.8", "once_cell", "version_check", ] [[package]] name = "aho-corasick" -version = "0.7.19" +version = "0.7.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" dependencies = [ "memchr", ] @@ 
-68,7 +68,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "befdff0b4683a0824fc8719ce639a252d9d62cd89c8d0004c39e2417128c1eb8" dependencies = [ - "axum 0.6.1", + "axum", "bytes", "cfg-if 1.0.0", "http", @@ -123,9 +123,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.65" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602" +checksum = "216261ddc8289130e551ddcd5ce8a064710c0d064a4d2895c67151c92b5443f6" [[package]] name = "anymap" @@ -141,7 +141,7 @@ dependencies = [ "common-error", "common-time", "datatypes", - "prost 0.11.0", + "prost 0.11.3", "snafu", "tonic", "tonic-build", @@ -192,30 +192,6 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" -[[package]] -name = "arrow" -version = "10.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1328dbc6d5d76a08b13df3ac630f61a6a31276d9e9d08eb813e98efa624c2382" -dependencies = [ - "bitflags", - "chrono", - "csv", - "flatbuffers 2.1.1", - "half 1.8.2", - "hex", - "indexmap", - "lazy_static", - "lexical-core", - "multiversion", - "num", - "rand 0.8.5", - "regex", - "serde", - "serde_derive", - "serde_json", -] - [[package]] name = "arrow" version = "26.0.0" @@ -230,10 +206,11 @@ dependencies = [ "arrow-select", "bitflags", "chrono", + "comfy-table", "csv", - "flatbuffers 22.9.29", + "flatbuffers", "half 2.1.0", - "hashbrown", + "hashbrown 0.12.3", "indexmap", "lazy_static", "lexical-core", @@ -256,7 +233,7 @@ dependencies = [ "arrow-schema", "chrono", "half 2.1.0", - "hashbrown", + "hashbrown 0.12.3", "num", ] @@ -282,21 +259,14 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-format" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2333f8ccf0d597ba779863c57a0b61f635721187fb2fdeabae92691d7d582fe5" -dependencies = [ - "planus", - "serde", -] - [[package]] name = "arrow-schema" version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f9406eb7834ca6bd8350d1baa515d18b9fcec487eddacfb62f5e19511f7bd37" +dependencies = [ + "serde", +] [[package]] name = "arrow-select" @@ -311,38 +281,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow2" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e387b20dd573a96f36b173d9027483898f944d696521afd74e2caa3c813d86e" -dependencies = [ - "ahash 0.7.6", - "arrow-format", - "base64", - "bytemuck", - "chrono", - "csv", - "csv-core", - "either", - "fallible-streaming-iterator", - "futures", - "hash_hasher", - "indexmap", - "itertools", - "lexical-core", - "multiversion", - "num-traits", - "parquet2", - "regex", - "serde", - "serde_derive", - "serde_json", - "simdutf8", - "streaming-iterator", - "strength_reduce", -] - [[package]] name = "ascii" version = "1.1.0" @@ -360,9 +298,9 @@ dependencies = [ [[package]] name = "async-channel" -version = "1.7.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14485364214912d3b19cc3435dde4df66065127f05fa0d75c712f36f12c2f28" +checksum = "cf46fee83e5ccffc220104713af3292ff9bc7c64c7de289f66dae8e38d826833" dependencies = [ "concurrent-queue", "event-listener", @@ -384,13 +322,15 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.3.14" +version = "0.3.15" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345fd392ab01f746c717b1357165b76f0b67a60192007b234058c9045fdcf695" +checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" dependencies = [ "brotli", + "bzip2", "flate2", "futures-core", + "futures-io", "memchr", "pin-project-lite", "tokio", @@ -398,9 +338,9 @@ dependencies = [ [[package]] name = "async-io" -version = "1.10.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8121296a9f05be7f34aa4196b1747243b3b62e048bb7906f644f3fbfc490cf7" +checksum = "8c374dda1ed3e7d8f0d9ba58715f924862c63eae6849c92d3a18e7fbde9e2794" dependencies = [ "async-lock", "autocfg", @@ -413,7 +353,7 @@ dependencies = [ "slab", "socket2", "waker-fn", - "winapi", + "windows-sys 0.42.0", ] [[package]] @@ -449,9 +389,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.57" +version = "0.1.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" +checksum = "31e6e93155431f3931513b243d371981bb2770112b370c82745a1d19d2f99364" dependencies = [ "proc-macro2", "quote", @@ -479,7 +419,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -496,37 +436,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" -[[package]] -name = "axum" -version = "0.5.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043" -dependencies = [ - "async-trait", - "axum-core 0.2.8", - "bitflags", - "bytes", - "futures-util", - "http", - "http-body", - "hyper", - "itoa 1.0.3", - "matchit 0.5.0", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tower", - "tower-http", - "tower-layer", - "tower-service", -] - [[package]] name = "axum" version = "0.6.1" @@ -534,15 +443,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08b108ad2665fa3f6e6a517c3d80ec3e77d224c47d605167aefaa5d7ef97fa48" dependencies = [ "async-trait", - "axum-core 0.3.0", + "axum-core", "bitflags", "bytes", "futures-util", "http", "http-body", "hyper", - "itoa 1.0.3", - "matchit 0.7.0", + "itoa 1.0.4", + "matchit", "memchr", "mime", "percent-encoding", @@ -560,22 +469,6 @@ dependencies = [ "tower-service", ] -[[package]] -name = "axum-core" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "tower-layer", - "tower-service", -] - [[package]] name = "axum-core" version = "0.3.0" @@ -608,9 +501,9 @@ dependencies = [ [[package]] name = "axum-test-helper" version = "0.1.1" -source = "git+https://github.com/sunng87/axum-test-helper.git?branch=patch-1#c90b5fed699080636330f3a97c1ee20d845329f0" +source = "git+https://github.com/sunng87/axum-test-helper.git?branch=patch-1#5aa7843ce2250144ea1b7f589f274c00cf1af4ab" dependencies = [ - "axum 0.5.16", + "axum", "bytes", "http", "http-body", @@ -629,7 +522,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" dependencies = [ "futures-core", - "getrandom 0.2.7", + "getrandom 0.2.8", "instant", "pin-project-lite", "rand 0.8.5", @@ -658,23 +551,23 @@ dependencies = [ "cc", "cfg-if 1.0.0", "libc", - "miniz_oxide", + "miniz_oxide 0.5.4", "object", "rustc-demangle", ] [[package]] name = "base64" -version = "0.13.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "benchmarks" version = "0.1.0" dependencies = [ - "arrow 10.0.0", - "clap 4.0.18", + "arrow", + "clap 4.0.29", "client", "indicatif", "itertools", @@ -761,15 +654,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "bitpacking" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" -dependencies = [ - "crunchy", -] - [[package]] name = "bitvec" version = "1.0.1" @@ -784,9 +668,9 @@ dependencies = [ [[package]] name = "blake2" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9cf849ee05b2ee5fba5e36f97ff8ec2533916700fc0758d40d92136a42f3388" +checksum = "b12e5fd123190ce1c2e559308a94c9bacad77907d4c6005d9e58fe1a0689e55e" dependencies = [ "digest", ] @@ -799,20 +683,20 @@ checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" dependencies = [ "arrayref", "arrayvec 0.5.2", - "constant_time_eq", + "constant_time_eq 0.1.5", ] [[package]] name = "blake3" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08e53fc5a564bb15bfe6fae56bd71522205f1f91893f9c0116edad6496c183f" +checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" dependencies = [ "arrayref", "arrayvec 0.7.2", "cc", "cfg-if 1.0.0", - "constant_time_eq", + "constant_time_eq 0.2.4", "digest", ] @@ -825,6 +709,51 @@ dependencies = [ "generic-array", ] +[[package]] +name = "borsh" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15bf3650200d8bffa99015595e10f1fbd17de07abbc25bb067da79e769939bfa" +dependencies = [ + "borsh-derive", + "hashbrown 0.11.2", +] + +[[package]] +name = "borsh-derive" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6441c552f230375d18e3cc377677914d2ca2b0d36e52129fe15450a2dce46775" +dependencies = [ + "borsh-derive-internal", + "borsh-schema-derive-internal", + "proc-macro-crate 0.1.5", + "proc-macro2", + "syn", +] + +[[package]] +name = "borsh-derive-internal" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5449c28a7b352f2d1e592a8a28bf139bc71afb0764a14f3c02500935d8c44065" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "borsh-schema-derive-internal" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdbd5696d8bfa21d53d9fe39a714a18538bad11492a42d066dbbc395fb1951c0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "brotli" version = "3.3.4" @@ -871,9 +800,30 @@ 
dependencies = [ [[package]] name = "bumpalo" -version = "3.11.0" +version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" + +[[package]] +name = "bytecheck" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d11cac2c12b5adc6570dad2ee1b87eff4955dac476fe12d81e5fdd352e52406f" +dependencies = [ + "bytecheck_derive", + "ptr_meta", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e576ebe98e605500b3c8041bb888e966653577172df6dd97398714eb30b9bf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "bytecount" @@ -881,26 +831,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" -[[package]] -name = "bytemuck" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9e1f5fa78f69496407a27ae9ed989e3c3b072310286f5ef385525e4cbc24a9" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "byteorder" version = "1.4.3" @@ -909,18 +839,33 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" +checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c" dependencies = [ "serde", ] [[package]] -name = "cache-padded" -version = "1.2.0" +name = "bzip2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1db59621ec70f09c5e9b597b220c7a2b43611f4710dc03ceb8748637775692c" +checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] [[package]] name = "cactus" @@ -1014,9 +959,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.73" +version = "1.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "e9f73505338f7d905b19d18738976aae232eb46b8efc15554ffc56deb5d9ebe4" dependencies = [ "jobserver", ] @@ -1058,9 +1003,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.22" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ "iana-time-zone", "js-sys", @@ -1155,9 +1100,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.22" +version = "3.2.23" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ "atty", "bitflags", @@ -1167,19 +1112,19 @@ dependencies = [ "once_cell", "strsim 0.10.0", "termcolor", - "textwrap 0.15.1", + "textwrap 0.16.0", ] [[package]] name = "clap" -version = "4.0.18" +version = "4.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335867764ed2de42325fafe6d18b8af74ba97ee0c590fa016f157535b42ab04b" +checksum = "4d63b9e9c07271b9957ad22c173bae2a4d9a81127680962039296abcd2f8251d" dependencies = [ - "atty", "bitflags", - "clap_derive 4.0.18", + "clap_derive 4.0.21", "clap_lex 0.3.0", + "is-terminal", "once_cell", "strsim 0.10.0", "termcolor", @@ -1200,9 +1145,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.18" +version = "4.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3" +checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" dependencies = [ "heck 0.4.0", "proc-macro-error", @@ -1271,9 +1216,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.48" +version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" +checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" dependencies = [ "cc", ] @@ -1284,7 +1229,7 @@ version = "0.1.0" dependencies = [ "anymap", "build-data", - "clap 3.2.22", + "clap 3.2.23", "common-error", "common-telemetry", "datanode", @@ -1301,25 +1246,24 @@ dependencies = [ ] [[package]] -name = "comfy-table" -version = "5.0.1" +name = "codespan-reporting" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" dependencies = [ - "strum 0.23.0", - "strum_macros 0.23.1", + "termcolor", "unicode-width", ] [[package]] name = "comfy-table" -version = "6.1.2" +version = "6.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1090f39f45786ec6dc6286f8ea9c75d0a7ef0a0d3cda674cef0c3af7b307fbc2" +checksum = "e621e7e86c46fd8a14c32c6ae3cb95656621b4743a27d0cffedb831d46e7ad21" dependencies = [ "crossterm", - "strum 0.24.1", - "strum_macros 0.24.3", + "strum", + "strum_macros", "unicode-width", ] @@ -1370,7 +1314,7 @@ dependencies = [ "common-function-macro", "common-query", "common-time", - "datafusion-common 7.0.0", + "datafusion-common", "datatypes", "libc", "num", @@ -1446,7 +1390,7 @@ dependencies = [ "common-recordbatch", "common-time", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datatypes", "snafu", @@ -1460,7 +1404,7 @@ version = "0.1.0" dependencies = [ "common-error", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datatypes", "futures", "paste", @@ -1518,22 +1462,22 @@ dependencies = [ [[package]] name = "concurrent-queue" -version = "1.2.4" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af4780a44ab5696ea9e28294517f1fffb421a83a25af521333c838635509db9c" +checksum = "bd7bef69dc86e3c610e4e7aed41035e2a7ed12e72dd7530f61327a6579a4390b" dependencies = [ - "cache-padded", + "crossbeam-utils", ] [[package]] 
name = "console" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89eab4d20ce20cea182308bca13088fecea9c05f6776cf287205d41a0ed3c847" +checksum = "c050367d967ced717c04b65d8c619d863ef9292ce0c5760028655a2fb298718c" dependencies = [ "encode_unicode", + "lazy_static", "libc", - "once_cell", "terminal_size", "unicode-width", "winapi", @@ -1545,8 +1489,8 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e57ff02e8ad8e06ab9731d5dc72dc23bef9200778eae1a89d555d8c42e5d4a86" dependencies = [ - "prost 0.11.0", - "prost-types 0.11.1", + "prost 0.11.3", + "prost-types 0.11.2", "tonic", "tracing-core", ] @@ -1563,7 +1507,7 @@ dependencies = [ "futures", "hdrhistogram", "humantime", - "prost-types 0.11.1", + "prost-types 0.11.2", "serde", "serde_json", "thread_local", @@ -1591,7 +1535,7 @@ version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" dependencies = [ - "getrandom 0.2.7", + "getrandom 0.2.8", "once_cell", "proc-macro-hack", "tiny-keccak", @@ -1603,6 +1547,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" +[[package]] +name = "constant_time_eq" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ad85c1f65dc7b37604eb0e89748faf0b9653065f2a8ef69f96a687ec1e9279" + [[package]] name = "core-foundation" version = "0.9.3" @@ -1688,7 +1638,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.22", + "clap 3.2.23", "criterion-plot 0.5.0", "itertools", "lazy_static", @@ -1761,23 +1711,22 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.10" +version = "0.9.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" dependencies = [ "autocfg", "cfg-if 1.0.0", "crossbeam-utils", - "memoffset", - "once_cell", + "memoffset 0.7.1", "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.6" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd42583b04998a5363558e5f9291ee5a5ff6b49944332103f251e7479a82aa7" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -1785,12 +1734,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if 1.0.0", - "once_cell", ] [[package]] @@ -1857,10 +1805,54 @@ dependencies = [ ] [[package]] -name = "darling" -version = "0.14.1" +name = "cxx" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02" +checksum = "bdf07d07d6531bfcdbe9b8b739b104610c6508dcc4d63b410585faf338241daf" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d2eb5b96ecdc99f72657332953d4d9c50135af1bac34277801cc3937906ebd39" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac040a39517fd1674e0f32177648334b0f4074625b5588a64519804ba0553b12" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1362b0ddcfc4eb0a1f57b68bd77dd99f0e826958a96abd0ae9bd092e114ffed6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "darling" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" dependencies = [ "darling_core", "darling_macro", @@ -1868,9 +1860,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" dependencies = [ "fnv", "ident_case", @@ -1882,9 +1874,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" dependencies = [ "darling_core", "quote", @@ -1898,7 +1890,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" dependencies = [ "cfg-if 1.0.0", - "hashbrown", + "hashbrown 0.12.3", "lock_api", "once_cell", "parking_lot_core", @@ -1906,44 +1898,47 @@ dependencies = [ [[package]] name = "datafusion" -version = "7.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a8411475928479fe57af18698626f0a44f3c29153e051dce45f7455c08a6d5" dependencies = [ - "ahash 0.7.6", - "arrow2", + "ahash 0.8.2", + "arrow", + "async-compression", "async-trait", + "bytes", + "bzip2", "chrono", - "comfy-table 5.0.1", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", + "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-row", + "datafusion-sql", + "flate2", "futures", - "hashbrown", + "glob", + "hashbrown 0.12.3", + "itertools", "lazy_static", "log", "num_cpus", - "ordered-float 2.10.0", + "object_store", + "ordered-float 3.4.0", "parking_lot", - "parquet2", + "parquet", "paste", + "percent-encoding", "pin-project-lite", "rand 0.8.5", "smallvec", - "sqlparser 0.15.0", + "sqlparser", "tempfile", "tokio", "tokio-stream", -] - -[[package]] -name = "datafusion-common" -version = "7.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" -dependencies = [ - "arrow2", - "ordered-float 2.10.0", - "parquet2", - "sqlparser 0.15.0", + "tokio-util", + "url", + "uuid", ] [[package]] @@ -1952,44 +1947,96 @@ version = "14.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15f1ffcbc1f040c9ab99f41db1c743d95aff267bb2e7286aaa010738b7402251" 
dependencies = [ - "arrow 26.0.0", + "arrow", "chrono", - "ordered-float 3.1.0", - "sqlparser 0.26.0", + "object_store", + "ordered-float 3.4.0", + "parquet", + "sqlparser", ] [[package]] name = "datafusion-expr" -version = "7.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1883d9590d303ef38fa295567e7fdb9f8f5f511fcc167412d232844678cd295c" dependencies = [ - "ahash 0.7.6", - "arrow2", - "datafusion-common 7.0.0", - "sqlparser 0.15.0", + "ahash 0.8.2", + "arrow", + "datafusion-common", + "log", + "sqlparser", +] + +[[package]] +name = "datafusion-optimizer" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2127d46d566ab3463d70da9675fc07b9d634be8d17e80d0e1ce79600709fe651" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.12.3", + "log", ] [[package]] name = "datafusion-physical-expr" -version = "7.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d108b6fe8eeb317ecad1d74619e8758de49cccc8c771b56c97962fd52eaae23" dependencies = [ - "ahash 0.7.6", - "arrow2", + "ahash 0.8.2", + "arrow", + "arrow-buffer", + "arrow-schema", "blake2", "blake3", "chrono", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", - "hashbrown", + "datafusion-row", + "half 2.1.0", + "hashbrown 0.12.3", + "itertools", "lazy_static", "md-5", - "ordered-float 2.10.0", + "num-traits", + "ordered-float 3.4.0", "paste", "rand 0.8.5", "regex", "sha2", "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-row" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43537b6377d506e4788bf21e9ed943340e076b48ca4d077e6ea4405ca5e54a1c" +dependencies = [ + "arrow", + "datafusion-common", + "paste", + "rand 0.8.5", +] + +[[package]] +name = "datafusion-sql" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244d08d4710e1088d9c0949c9b5b8d68d9cf2cde7203134a4cc389e870fe2354" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "sqlparser", ] [[package]] @@ -1998,7 +2045,7 @@ version = "0.1.0" dependencies = [ "api", "async-trait", - "axum 0.6.1", + "axum", "axum-macros", "axum-test-helper", "backon", @@ -2015,7 +2062,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datatypes", "futures", "hyper", @@ -2049,34 +2096,16 @@ dependencies = [ name = "datatypes" version = "0.1.0" dependencies = [ - "arrow2", + "arrow", + "arrow-schema", "common-base", "common-error", "common-time", - "datafusion-common 7.0.0", + "datafusion-common", "enum_dispatch", "num", "num-traits", - "ordered-float 3.1.0", - "paste", - "serde", - "serde_json", - "snafu", -] - -[[package]] -name = "datatypes2" -version = "0.1.0" -dependencies = [ - "arrow 26.0.0", - "common-base", - "common-error", - "common-time", - "datafusion-common 14.0.0", - "enum_dispatch", - "num", - "num-traits", - "ordered-float 3.1.0", + "ordered-float 3.4.0", "paste", "serde", "serde_json", @@ -2133,9 +2162,9 @@ checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" 
[[package]] name = "digest" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" dependencies = [ "block-buffer", "crypto-common", @@ -2345,7 +2374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1259da3b15ec7e54bd7203adb2c4335adb9ca1d47b56220d650e52c247e824a" dependencies = [ "http", - "prost 0.11.0", + "prost 0.11.3", "tokio", "tokio-stream", "tonic", @@ -2372,12 +2401,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" -[[package]] -name = "fallible-streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" - [[package]] name = "fastrand" version = "1.8.0" @@ -2389,13 +2412,13 @@ dependencies = [ [[package]] name = "fd-lock" -version = "3.0.6" +version = "3.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e11dcc7e4d79a8c89b9ab4c6f5c30b1fc4a83c420792da3542fd31179ed5f517" +checksum = "bb21c69b9fea5e15dbc1049e4b77145dd0ba1c84019c488102de0dc4ea4b0a27" dependencies = [ "cfg-if 1.0.0", "rustix", - "windows-sys 0.36.1", + "windows-sys 0.42.0", ] [[package]] @@ -2422,17 +2445,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda653ca797810c02f7ca4b804b40b8b95ae046eb989d356bce17919a8c25499" -[[package]] -name = "flatbuffers" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea97b4fe4b84e2f2765449bcea21cbdb3ee28cecb88afbf38a0c2e1639f5eb5" -dependencies = [ - "bitflags", - "smallvec", - "thiserror", -] - [[package]] name = "flatbuffers" version = "22.9.29" @@ -2445,13 +2457,13 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" +checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" dependencies = [ "crc32fast", "libz-sys", - "miniz_oxide", + "miniz_oxide 0.6.2", ] [[package]] @@ -2500,7 +2512,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datanode", "datatypes", @@ -2511,7 +2523,7 @@ dependencies = [ "meta-srv", "moka", "openmetrics-parser", - "prost 0.11.0", + "prost 0.11.3", "query", "rustls", "serde", @@ -2520,7 +2532,6 @@ dependencies = [ "session", "snafu", "sql", - "sqlparser 0.15.0", "store-api", "substrait 0.1.0", "table", @@ -2532,9 +2543,9 @@ dependencies = [ [[package]] name = "frunk" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cd67cf7d54b7e72d0ea76f3985c3747d74aee43e0218ad993b7903ba7a5395e" +checksum = "a89c703bf50009f383a0873845357cc400a95fc535f836feddfe015d7df6e1e0" dependencies = [ "frunk_core", "frunk_derives", @@ -2543,15 +2554,15 @@ dependencies = [ [[package]] name = "frunk_core" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1246cf43ec80bf8b2505b5c360b8fb999c97dabd17dbb604d85558d5cbc25482" +checksum = 
"2a446d01a558301dca28ef43222864a9fa2bd9a2e71370f769d5d5d5ec9f3537" [[package]] name = "frunk_derives" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dbc4f084ec5a3f031d24ccedeb87ab2c3189a2f33b8d070889073837d5ea09e" +checksum = "b83164912bb4c97cfe0772913c7af7387ee2e00cb6d4636fb65a35b3d0c8f173" dependencies = [ "frunk_proc_macro_helpers", "quote", @@ -2560,9 +2571,9 @@ dependencies = [ [[package]] name = "frunk_proc_macro_helpers" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99f11257f106c6753f5ffcb8e601fb39c390a088017aaa55b70c526bff15f63e" +checksum = "015425591bbeb0f5b8a75593340f1789af428e9f887a4f1e36c0c471f067ef50" dependencies = [ "frunk_core", "proc-macro2", @@ -2572,9 +2583,9 @@ dependencies = [ [[package]] name = "frunk_proc_macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a078bd8459eccbb85e0b007b8f756585762a72a9efc53f359b371c3b6351dbcc" +checksum = "ea01524f285deab48affffb342b97f186e657b119c3f1821ac531780e0fbfae0" dependencies = [ "frunk_core", "frunk_proc_macros_impl", @@ -2583,9 +2594,9 @@ dependencies = [ [[package]] name = "frunk_proc_macros_impl" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ffba99f0fa4f57e42f57388fbb9a0ca863bc2b4261f3c5570fed579d5df6c32" +checksum = "0a802d974cc18ee7fe1a7868fc9ce31086294fd96ba62f8da64ecb44e92a2653" dependencies = [ "frunk_core", "frunk_proc_macro_helpers", @@ -2608,9 +2619,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "futures" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0" dependencies = [ "futures-channel", "futures-core", @@ -2623,9 +2634,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed" dependencies = [ "futures-core", "futures-sink", @@ -2633,15 +2644,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" +checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac" [[package]] name = "futures-executor" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2" dependencies = [ "futures-core", "futures-task", @@ -2650,9 +2661,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" +checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb" [[package]] name = "futures-lite" @@ -2671,9 +2682,9 @@ dependencies = [ [[package]] name = 
"futures-macro" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d" dependencies = [ "proc-macro2", "quote", @@ -2682,21 +2693,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" +checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9" [[package]] name = "futures-task" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" +checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea" [[package]] name = "futures-util" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6" dependencies = [ "futures-channel", "futures-core", @@ -2752,9 +2763,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -2789,9 +2800,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -2823,10 +2834,13 @@ dependencies = [ ] [[package]] -name = "hash_hasher" -version = "2.0.3" +name = "hashbrown" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.6", +] [[package]] name = "hashbrown" @@ -2874,6 +2888,15 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + [[package]] name = "hex" version = "0.4.3" @@ -2903,7 +2926,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.3", + "itoa 1.0.4", ] [[package]] @@ -2953,9 +2976,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.20" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", "futures-channel", @@ -2966,7 +2989,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.3", + "itoa 1.0.4", "pin-project-lite", "socket2", "tokio", 
@@ -2977,9 +3000,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.0" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" +checksum = "59df7c4e19c950e6e0e868dcc0a300b09a9b88e9ec55bd879ca819087a77355d" dependencies = [ "http", "hyper", @@ -3002,17 +3025,28 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.49" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bbaead50122b06e9a973ac20bc7445074d99ad9a0a0654934876908a9cec82c" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" dependencies = [ "android_system_properties", "core-foundation-sys", + "iana-time-zone-haiku", "js-sys", "wasm-bindgen", "winapi", ] +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -3031,23 +3065,24 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.1" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.3", "serde", ] [[package]] name = "indicatif" -version = "0.17.1" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfddc9561e8baf264e0e45e197fd7696320026eb10a8180340debc27b18f535b" +checksum = "4295cbb7573c16d310e99e713cf9e75101eb190ab31fccd35f2d2691b4352b19" dependencies = [ "console", "number_prefix", + "portable-atomic", "unicode-width", ] @@ -3071,12 +3106,6 @@ dependencies = [ "cfg-if 1.0.0", ] -[[package]] -name = "integer-encoding" -version = "1.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f" - [[package]] name = "integer-encoding" version = "3.0.4" @@ -3089,15 +3118,19 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "0.7.3" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ea37f355c05dde75b84bba2d767906ad522e97cd9e2eef2be7a4ab7fb442c06" +checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +dependencies = [ + "libc", + "windows-sys 0.42.0", +] [[package]] name = "ipnet" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" +checksum = "f88c5561171189e69df9d98bcf18fd5f9558300f7ea7b801eb8a0fd748bd8745" [[package]] name = "iri-string" @@ -3121,6 +3154,18 @@ dependencies = [ "syn", ] +[[package]] +name = "is-terminal" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927609f78c2913a6f6ac3c27a4fe87f43e2a35367c0c4b0f8265e8f49a104330" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys 0.42.0", +] + [[package]] name = "itertools" version = "0.10.5" @@ -3138,15 +3183,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.3" +version = "1.0.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" [[package]] name = "jobserver" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b" dependencies = [ "libc", ] @@ -3162,9 +3207,9 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.1.1" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" +checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ "base64", "pem", @@ -3302,15 +3347,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.133" +version = "0.2.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966" +checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" [[package]] name = "libloading" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ "cfg-if 1.0.0", "winapi", @@ -3318,9 +3363,9 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" +checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "libz-sys" @@ -3334,10 +3379,19 @@ dependencies = [ ] [[package]] -name = "linux-raw-sys" -version = "0.0.46" +name = "link-cplusplus" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4d2456c373231a208ad294c33dc5bff30051eafd954cd4caae83a712b12854d" +checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369" +dependencies = [ + "cc", +] + +[[package]] +name = "linux-raw-sys" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f9f08d8963a6c613f4b1a78f4f4a4dbfadf8e6545b2d72861731e4858b8b47f" [[package]] name = "lock_api" @@ -3443,7 +3497,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909" dependencies = [ - "hashbrown", + "hashbrown 0.12.3", ] [[package]] @@ -3481,7 +3535,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b238e3235c8382b7653c6408ed1b08dd379bdb9fdf990fb0bbae3db2cc0ae963" dependencies = [ - "nix 0.23.1", + "nix 0.23.2", "winapi", ] @@ -3515,12 +3569,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" -[[package]] -name = "matchit" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" - [[package]] name = "matchit" version = "0.7.0" @@ -3575,6 +3623,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = 
"memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "meta-client" version = "0.1.0" @@ -3618,7 +3675,7 @@ dependencies = [ "http-body", "lazy_static", "parking_lot", - "prost 0.11.0", + "prost 0.11.3", "regex", "serde", "serde_json", @@ -3677,7 +3734,7 @@ checksum = "f7d24dc2dbae22bff6f1f9326ffce828c9f07ef9cc1e8002e5279f845432a30a" dependencies = [ "crossbeam-epoch", "crossbeam-utils", - "hashbrown", + "hashbrown 0.12.3", "metrics", "num_cpus", "parking_lot", @@ -3718,15 +3775,24 @@ dependencies = [ ] [[package]] -name = "mio" -version = "0.8.4" +name = "miniz_oxide" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.36.1", + "windows-sys 0.42.0", ] [[package]] @@ -3744,7 +3810,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datatypes", "futures", "log-store", @@ -3822,9 +3888,9 @@ dependencies = [ [[package]] name = "mysql_async" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fbd756177cfa8248baa7c5f555b9446349822bb94810c22336ec7597a72652" +checksum = "52d8156a1f6a19224593c556c8aac642cf8070abd53d563405da92879dcf341b" dependencies = [ "bytes", "crossbeam", @@ -3840,6 +3906,7 @@ dependencies = [ "pem", "percent-encoding", "pin-project", + "priority-queue", "rustls", "rustls-pemfile", "serde", @@ -3889,7 +3956,7 @@ dependencies = [ "smallvec", "subprocess", "thiserror", - "time 0.3.14", + "time 0.3.17", "uuid", ] @@ -3939,27 +4006,27 @@ dependencies = [ [[package]] name = "nix" -version = "0.23.1" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" dependencies = [ "bitflags", "cc", "cfg-if 1.0.0", "libc", - "memoffset", + "memoffset 0.6.5", ] [[package]] name = "nix" -version = "0.24.2" +version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "195cdbc1741b8134346d515b3a56a1c94b0912758009cfd53f99ea0f57b065fc" +checksum = "fa52e972a9a719cecb6864fb88568781eb706bac2cd1d4f04a648542dbf78069" dependencies = [ "bitflags", "cfg-if 1.0.0", "libc", - "memoffset", + "memoffset 0.6.5", ] [[package]] @@ -3972,6 +4039,16 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.0" @@ -4053,11 +4130,11 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +checksum = "f6058e64324c71e02bc2b150e4f3bc8286db6c83092132ffa3f6b1eab0f9def5" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", ] @@ -4076,21 +4153,12 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b0498641e53dd6ac1a4f22547548caa6864cc4933784319cd1775271c5a46ce" dependencies = [ - "proc-macro-crate", + "proc-macro-crate 1.2.1", "proc-macro2", "quote", "syn", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -4119,6 +4187,26 @@ dependencies = [ "uuid", ] +[[package]] +name = "object_store" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0014545954c5023b5fb8260415e54467cde434db6c824c9028a4b329f1b28e48" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "itertools", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + [[package]] name = "once_cell" version = "1.16.0" @@ -4159,7 +4247,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "time 0.3.14", + "time 0.3.17", "tokio", "tracing", "ureq", @@ -4261,18 +4349,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "2.10.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" -dependencies = [ - "num-traits", -] - -[[package]] -name = "ordered-float" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a" +checksum = "d84eb1409416d254e4a9c8fa56cc24701755025b458f0fcd8e59e1f5f40c23bf" dependencies = [ "num-traits", "serde", @@ -4285,14 +4364,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccd746e37177e1711c20dd619a1620f34f5c8b569c53590a72dedd5344d8924a" dependencies = [ "dlv-list", - "hashbrown", + "hashbrown 0.12.3", ] [[package]] name = "os_str_bytes" -version = "6.3.0" +version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "packedvec" @@ -4332,9 +4417,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba" dependencies = [ "backtrace", "cfg-if 1.0.0", @@ -4343,40 +4428,34 @@ dependencies = [ "redox_syscall 0.2.16", "smallvec", "thread-id", - "windows-sys 0.36.1", + "windows-sys 0.42.0", ] [[package]] name = "parquet" -version = "10.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53e9c8fc20af9b92d85d42ec86e5217b2eaf1340fbba75c4b4296de764ea7921" +checksum = 
"3bf8fa7ab6572791325a8595f55dc532dde88b996ae10a5ca8a2db746784ecc4" dependencies = [ - "arrow 10.0.0", + "ahash 0.8.2", + "arrow", "base64", "brotli", - "byteorder", + "bytes", "chrono", "flate2", + "futures", + "hashbrown 0.12.3", "lz4", "num", "num-bigint", - "parquet-format", - "rand 0.8.5", + "seq-macro", "snap", - "thrift 0.13.0", + "thrift 0.16.0", + "tokio", "zstd", ] -[[package]] -name = "parquet-format" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f0c06cdcd5460967c485f9c40a821746f5955ad81990533c7fae95dbd9bc0b5" -dependencies = [ - "thrift 0.13.0", -] - [[package]] name = "parquet-format-async-temp" version = "0.2.0" @@ -4386,28 +4465,10 @@ dependencies = [ "async-trait", "byteorder", "futures", - "integer-encoding 3.0.4", + "integer-encoding", "ordered-float 1.1.1", ] -[[package]] -name = "parquet2" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b085f9e78e4842865151b693f6d94bdf7b280af66daa6e3587adeb3106a07e9" -dependencies = [ - "async-stream", - "bitpacking", - "brotli", - "flate2", - "futures", - "lz4", - "parquet-format-async-temp", - "snap", - "streaming-decompression", - "zstd", -] - [[package]] name = "parse-zoneinfo" version = "0.3.0" @@ -4446,9 +4507,9 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "pest" -version = "2.4.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc7bc69c062e492337d74d59b120c274fd3d261b6bf6d3207d499b4b379c41a" +checksum = "cc8bed3549e0f9b0a2a78bf7c0018237a2cdf085eecbbc048e52612438e4e9d0" dependencies = [ "thiserror", "ucd-trie", @@ -4456,9 +4517,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.4.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b75706b9642ebcb34dab3bc7750f811609a0eb1dd8b88c2d15bf628c1c65b2" +checksum = "cdc078600d06ff90d4ed238f0119d84ab5d43dbaad278b0e33a8820293b32344" dependencies = [ "pest", "pest_generator", @@ -4466,9 +4527,9 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.4.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f9272122f5979a6511a749af9db9bfc810393f63119970d7085fed1c4ea0db" +checksum = "28a1af60b1c4148bb269006a750cff8e2ea36aff34d2d96cf7be0b14d1bed23c" dependencies = [ "pest", "pest_meta", @@ -4479,9 +4540,9 @@ dependencies = [ [[package]] name = "pest_meta" -version = "2.4.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8717927f9b79515e565a64fe46c38b8cd0427e64c40680b14a7365ab09ac8d" +checksum = "fec8605d59fc2ae0c6c1aefc0c7c7a9769732017c0ce07f7a9cfffa7b4404f20" dependencies = [ "once_cell", "pest", @@ -4515,7 +4576,7 @@ dependencies = [ "postgres-types", "rand 0.8.5", "thiserror", - "time 0.3.14", + "time 0.3.17", "tokio", "tokio-rustls", "tokio-util", @@ -4638,9 +4699,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] name = "planus" @@ -4692,16 +4753,16 @@ dependencies = [ [[package]] name = "polling" -version = "2.4.0" +version = "2.5.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab4609a838d88b73d8238967b60dd115cc08d38e2bbaf51ee1e4b695f89122e2" +checksum = "166ca89eb77fd403230b9c156612965a81e094ec6ec3aa13663d4c8b113fa748" dependencies = [ "autocfg", "cfg-if 1.0.0", "libc", "log", "wepoll-ffi", - "winapi", + "windows-sys 0.42.0", ] [[package]] @@ -4741,9 +4802,9 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "precomputed-hash" @@ -4764,9 +4825,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.19" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a49e86d2c26a24059894a3afa13fd17d063419b05dfb83f06d9c3566060c3f5a" +checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" dependencies = [ "proc-macro2", "syn", @@ -4786,6 +4847,25 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "priority-queue" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7685ca4cc0b3ad748c22ce6803e23b55b9206ef7715b965ebeaf41639238fdc" +dependencies = [ + "autocfg", + "indexmap", +] + +[[package]] +name = "proc-macro-crate" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785" +dependencies = [ + "toml", +] + [[package]] name = "proc-macro-crate" version = "1.2.1" @@ -4829,9 +4909,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.43" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ "unicode-ident", ] @@ -4869,12 +4949,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "399c3c31cdec40583bb68f0b18403400d01ec4289c383aa047560439952c4dd7" +checksum = "c0b18e655c21ff5ac2084a5ad0611e827b3f92badf79f4910b5a5c58f4d87ff0" dependencies = [ "bytes", - "prost-derive 0.11.0", + "prost-derive 0.11.2", ] [[package]] @@ -4899,9 +4979,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.1" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f835c582e6bd972ba8347313300219fed5bfa52caf175298d860b61ff6069bb" +checksum = "e330bf1316db56b12c2bcfa399e8edddd4821965ea25ddb2c134b610b1c1c604" dependencies = [ "bytes", "heck 0.4.0", @@ -4910,9 +4990,11 @@ dependencies = [ "log", "multimap", "petgraph", - "prost 0.11.0", - "prost-types 0.11.1", + "prettyplease", + "prost 0.11.3", + "prost-types 0.11.2", "regex", + "syn", "tempfile", "which", ] @@ -4932,9 +5014,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.11.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7345d5f0e08c0536d7ac7229952590239e77abf0a0100a1b1d890add6ea96364" +checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" dependencies = [ "anyhow", "itertools", @@ -4955,12 +5037,32 @@ dependencies = [ 
[[package]] name = "prost-types" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dfaa718ad76a44b3415e6c4d53b17c8f99160dcb3a99b10470fce8ad43f6e3e" +checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", - "prost 0.11.0", + "prost 0.11.3", +] + +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -5013,8 +5115,11 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", + "datafusion-expr", + "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-sql", "datatypes", "format_num", "futures", @@ -5128,7 +5233,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.7", + "getrandom 0.2.8", ] [[package]] @@ -5158,11 +5263,10 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" -version = "1.5.3" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +checksum = "1e060280438193c554f654141c9ea9417886713b7acd75974c85b18a69a88e0b" dependencies = [ - "autocfg", "crossbeam-deque", "either", "rayon-core", @@ -5170,9 +5274,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.3" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -5221,16 +5325,16 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ - "getrandom 0.2.7", + "getrandom 0.2.8", "redox_syscall 0.2.16", "thiserror", ] [[package]] name = "regex" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" dependencies = [ "aho-corasick", "memchr", @@ -5248,9 +5352,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.27" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "remove_dir_all" @@ -5261,6 +5365,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "rend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79af64b4b6362ffba04eef3a4e10829718a4896dac19daa741851c86781edf95" +dependencies = [ + "bytecheck", +] + [[package]] name = "reqsign" 
version = "0.6.9" @@ -5286,15 +5399,15 @@ dependencies = [ "serde_json", "sha1", "sha2", - "time 0.3.14", + "time 0.3.17", "ureq", ] [[package]] name = "reqwest" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" +checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c" dependencies = [ "base64", "bytes", @@ -5368,6 +5481,31 @@ dependencies = [ "winapi", ] +[[package]] +name = "rkyv" +version = "0.7.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cec2b3485b07d96ddfd3134767b8a447b45ea4eb91448d0a35180ec0ffd5ed15" +dependencies = [ + "bytecheck", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eaedadc88b53e36dd32d940ed21ae4d850d5916f2581526921f553a72ac34c4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ron" version = "0.7.1" @@ -5387,7 +5525,7 @@ checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb" dependencies = [ "base64", "blake2b_simd", - "constant_time_eq", + "constant_time_eq 0.1.5", "crossbeam-utils", ] @@ -5403,13 +5541,20 @@ dependencies = [ [[package]] name = "rust_decimal" -version = "1.26.1" +version = "1.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee9164faf726e4f3ece4978b25ca877ddc6802fa77f38cdccb32c7f805ecd70c" +checksum = "33c321ee4e17d2b7abe12b5d20c1231db708dd36185c8a21e9de5fed6da4dbe9" dependencies = [ "arrayvec 0.7.2", + "borsh", + "bytecheck", + "byteorder", + "bytes", "num-traits", + "rand 0.8.5", + "rkyv", "serde", + "serde_json", ] [[package]] @@ -5444,16 +5589,16 @@ dependencies = [ [[package]] name = "rustix" -version = "0.35.10" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af895b90e5c071badc3136fc10ff0bcfc98747eadbaf43ed8f214e07ba8f8477" +checksum = "a3807b5d10909833d3e9acd1eb5fb988f79376ff10fce42937de71a449c4c588" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.36.1", + "windows-sys 0.42.0", ] [[package]] @@ -5583,7 +5728,7 @@ dependencies = [ "rustpython-doc", "syn", "syn-ext", - "textwrap 0.15.1", + "textwrap 0.15.2", ] [[package]] @@ -5655,7 +5800,7 @@ dependencies = [ "memchr", "memmap2", "mt19937", - "nix 0.24.2", + "nix 0.24.3", "num-bigint", "num-complex", "num-integer", @@ -5709,7 +5854,7 @@ dependencies = [ "crossbeam-utils", "exitcode", "flate2", - "getrandom 0.2.7", + "getrandom 0.2.8", "glob", "half 1.8.2", "hex", @@ -5720,8 +5865,8 @@ dependencies = [ "libc", "log", "memchr", - "memoffset", - "nix 0.24.2", + "memoffset 0.6.5", + "nix 0.24.3", "num-bigint", "num-complex", "num-integer", @@ -5748,8 +5893,8 @@ dependencies = [ "serde", "sre-engine", "static_assertions", - "strum 0.24.1", - "strum_macros 0.24.3", + "strum", + "strum_macros", "thiserror", "thread_local", "timsort", @@ -5787,7 +5932,7 @@ dependencies = [ "libc", "log", "memchr", - "nix 0.24.2", + "nix 0.24.3", "radix_trie", "scopeguard", "unicode-segmentation", @@ -5920,6 +6065,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scratch" +version = "1.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" + [[package]] name = "script" version = "0.1.0" @@ -5935,7 +6086,7 @@ dependencies = [ "common-time", "console", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datafusion-physical-expr", "datatypes", @@ -5975,6 +6126,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "security-framework" version = "2.7.0" @@ -6026,10 +6183,16 @@ dependencies = [ ] [[package]] -name = "serde" -version = "1.0.145" +name = "seq-macro" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +checksum = "0772c5c30e1a0d91f6834f8e545c69281c099dfa9a3ac58d96a9fd629c8d4898" + +[[package]] +name = "serde" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "256b9932320c590e707b94576e3cc1f7c9024d0ee6612dfbcf1cb106cbe8e055" dependencies = [ "serde_derive", ] @@ -6046,9 +6209,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +checksum = "b4eae9b04cbffdfd550eb462ed33bc6a1b68c935127d008b27444d08380f94e4" dependencies = [ "proc-macro2", "quote", @@ -6068,12 +6231,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.85" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +checksum = "020ff22c755c2ed3f8cf162dbb41a7268d934702f3ed3631656ea597e08fc3db" dependencies = [ - "indexmap", - "itoa 1.0.3", + "itoa 1.0.4", "ryu", "serde", ] @@ -6094,7 +6256,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.3", + "itoa 1.0.4", "ryu", "serde", ] @@ -6106,7 +6268,7 @@ dependencies = [ "aide", "api", "async-trait", - "axum 0.6.1", + "axum", "axum-macros", "axum-test-helper", "base64", @@ -6136,7 +6298,7 @@ dependencies = [ "openmetrics-parser", "opensrv-mysql", "pgwire", - "prost 0.11.0", + "prost 0.11.3", "query", "rand 0.8.5", "regex", @@ -6150,7 +6312,7 @@ dependencies = [ "sha1", "snafu", "snap", - "strum 0.24.1", + "strum", "table", "tempdir", "tokio", @@ -6175,9 +6337,9 @@ dependencies = [ [[package]] name = "sha-1" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "028f48d513f9678cda28f6e4064755b3fbb2af6acd672f2c209b62323f7aea0f" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" dependencies = [ "cfg-if 1.0.0", "cpufeatures", @@ -6273,12 +6435,6 @@ dependencies = [ "paste", ] -[[package]] -name = "simdutf8" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" - [[package]] name = "simple_asn1" version = "0.6.2" @@ -6288,7 +6444,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.14", + "time 0.3.17", ] [[package]] @@ -6340,15 
+6496,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "snafu" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5177903bf45656592d9eb5c0e22f408fc023aae51dbe2088889b71633ba451f2" +checksum = "a152ba99b054b22972ee794cf04e5ef572da1229e33b65f3c57abbff0525a454" dependencies = [ "backtrace", "doc-comment", @@ -6357,9 +6513,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "410b26ed97440d90ced3e2488c868d56a86e2064f5d7d6f417909b286afe25e5" +checksum = "d5e79cdebbabaebb06a9bdbaedc7f159b410461f63611d4d0e3fb0fab8fed850" dependencies = [ "heck 0.4.0", "proc-macro2", @@ -6369,9 +6525,9 @@ dependencies = [ [[package]] name = "snap" -version = "1.0.5" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" [[package]] name = "socket2" @@ -6415,7 +6571,7 @@ dependencies = [ "mito", "once_cell", "snafu", - "sqlparser 0.15.0", + "sqlparser", ] [[package]] @@ -6439,20 +6595,11 @@ version = "0.1.0" dependencies = [ "async-trait", "client", - "comfy-table 6.1.2", + "comfy-table", "sqlness", "tokio", ] -[[package]] -name = "sqlparser" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adbbea2526ad0d02ad9414a07c396078a5b944bbf9ca4fbab8f01bb4cb579081" -dependencies = [ - "log", -] - [[package]] name = "sqlparser" version = "0.26.0" @@ -6507,7 +6654,7 @@ name = "storage" version = "0.1.0" dependencies = [ "arc-swap", - "arrow-format", + "async-compat", "async-stream", "async-trait", "atomic_float", @@ -6525,9 +6672,10 @@ dependencies = [ "lazy_static", "log-store", "object-store", + "parquet", "paste", "planus", - "prost 0.11.0", + "prost 0.11.3", "rand 0.8.5", "regex", "serde", @@ -6569,21 +6717,6 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0" -[[package]] -name = "streaming-decompression" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3" -dependencies = [ - "fallible-streaming-iterator", -] - -[[package]] -name = "streaming-iterator" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0085b81d5d4e57f264d492641cf80ea508c96d9a0e47c6296e8f016504e28fd7" - [[package]] name = "streaming-stats" version = "0.2.3" @@ -6593,12 +6726,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "strength_reduce" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3ff2f71c82567c565ba4b3009a9350a96a7269eaa4001ebedae926230bc2254" - [[package]] name = "string_cache" version = "0.8.4" @@ -6658,32 +6785,13 @@ dependencies = [ "syn", ] -[[package]] -name = "strum" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" - [[package]] name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" dependencies = [ - "strum_macros 0.24.3", -] - -[[package]] -name = "strum_macros" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" -dependencies = [ - "heck 0.3.3", - "proc-macro2", - "quote", - "rustversion", - "syn", + "strum_macros", ] [[package]] @@ -6749,9 +6857,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "syn" -version = "1.0.100" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52205623b1b0f064a4e71182c3b18ae902267282930c6d5462c91b859668426e" +checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" dependencies = [ "proc-macro2", "quote", @@ -6806,11 +6914,12 @@ dependencies = [ "common-recordbatch", "common-telemetry", "datafusion", - "datafusion-common 7.0.0", + "datafusion-common", "datafusion-expr", "datatypes", "derive_builder", "futures", + "parquet", "parquet-format-async-temp", "paste", "serde", @@ -6912,7 +7021,7 @@ name = "tests-integration" version = "0.1.0" dependencies = [ "api", - "axum 0.6.1", + "axum", "axum-test-helper", "catalog", "client", @@ -6950,24 +7059,30 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" +checksum = "b7b3e525a49ec206798b40326a44121291b530c963cfb01018f63e135bac543d" + +[[package]] +name = "textwrap" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53f98874615aea268107765aa1ed8f6116782501d18e53d08b471733bea6c85" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8b463991b4eab2d801e724172285ec4195c650e8ec79b149e6c2a8e6dd3f783" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ "proc-macro2", "quote", @@ -7005,12 +7120,12 @@ dependencies = [ [[package]] name = "thrift" -version = "0.13.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +checksum = "b82ca8f46f95b3ce96081fe3dd89160fdea970c254bb72925255d1b62aae692e" dependencies = [ "byteorder", - "integer-encoding 1.1.7", + "integer-encoding", "log", "ordered-float 1.1.1", "threadpool", @@ -7018,15 +7133,13 @@ dependencies = [ [[package]] name = "thrift" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b82ca8f46f95b3ce96081fe3dd89160fdea970c254bb72925255d1b62aae692e" +checksum = "09678c4cdbb4eed72e18b7c2af1329c69825ed16fcbac62d083fc3e2b0590ff0" dependencies = [ "byteorder", - "integer-encoding 3.0.4", - "log", + 
"integer-encoding", "ordered-float 1.1.1", - "threadpool", ] [[package]] @@ -7041,22 +7154,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.14" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c3f9a28b618c3a6b9251b6908e9c99e04b9e5c02e6581ccbb67d59c34ef7f9b" +checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" dependencies = [ - "itoa 1.0.3", - "libc", - "num_threads", + "itoa 1.0.4", "serde", + "time-core", "time-macros", ] [[package]] -name = "time-macros" -version = "0.2.4" +name = "time-core" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + +[[package]] +name = "time-macros" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +dependencies = [ + "time-core", +] [[package]] name = "timsort" @@ -7100,9 +7221,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.21.1" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" +checksum = "eab6d665857cc6ca78d6e80303a02cea7a7851e85dfbd77cbdc09bd129f1ef46" dependencies = [ "autocfg", "bytes", @@ -7110,14 +7231,13 @@ dependencies = [ "memchr", "mio", "num_cpus", - "once_cell", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", "tracing", - "winapi", + "windows-sys 0.42.0", ] [[package]] @@ -7132,9 +7252,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.0" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", @@ -7192,9 +7312,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6edf2d6bc038a43d31353570e27270603f4648d18f5ed10c0e179abe43255af" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -7240,13 +7360,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11cd56bdb54ef93935a6a79dbd1d91f1ebd4c64150fd61654031fd6b8b775c91" +checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" dependencies = [ "async-stream", "async-trait", - "axum 0.5.16", + "axum", "base64", "bytes", "futures-core", @@ -7258,8 +7378,8 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.11.0", - "prost-derive 0.11.0", + "prost 0.11.3", + "prost-derive 0.11.2", "tokio", "tokio-stream", "tokio-util", @@ -7272,13 +7392,13 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.8.0" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fbcd2800e34e743b9ae795867d5f77b535d3a3be69fd731e39145719752df8c" +checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ "prettyplease", 
"proc-macro2", - "prost-build 0.11.1", + "prost-build 0.11.3", "quote", "syn", ] @@ -7290,8 +7410,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0455f730d540a1484bffc3c55c94100b18a662597b982c2e9073f2c55c602616" dependencies = [ "bytes", - "prost 0.11.0", - "prost-types 0.11.1", + "prost 0.11.3", + "prost-types 0.11.2", "tokio", "tokio-stream", "tonic", @@ -7321,9 +7441,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" dependencies = [ "async-compression", "base64", @@ -7363,9 +7483,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if 1.0.0", "log", @@ -7381,15 +7501,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" dependencies = [ "crossbeam-channel", - "time 0.3.14", + "time 0.3.17", "tracing-subscriber", ] [[package]] name = "tracing-attributes" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -7398,15 +7518,15 @@ dependencies = [ [[package]] name = "tracing-bunyan-formatter" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a788f2119fde477cd33823330c14004fa8cdac6892fd6f12181bbda9dbf14fc9" +checksum = "a2445962f94a813b2aaea29ceeccb6dce9fd3aa5b1cb45595cde755b00d021ad" dependencies = [ "gethostname", "log", "serde", "serde_json", - "time 0.3.14", + "time 0.3.17", "tracing", "tracing-core", "tracing-log", @@ -7415,9 +7535,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.29" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", "valuable", @@ -7462,12 +7582,12 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60db860322da191b40952ad9affe65ea23e7dd6a5c442c2c42865810c6ab8e6b" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" dependencies = [ - "ansi_term", "matchers", + "nu-ansi-term", "once_cell", "regex", "sharded-slab", @@ -7512,9 +7632,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "ucd-trie" @@ -7679,9 +7799,9 @@ checksum = 
"623f59e6af2a98bdafeb93fa277ac8e1e40440973001ca15cf4ae1541cd16d56" [[package]] name = "unicode-ident" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" @@ -7758,12 +7878,12 @@ checksum = "936e4b492acfd135421d8dca4b1aa80a7bfc26e702ef3af710e0752684df5372" [[package]] name = "uuid" -version = "1.1.2" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6469f4314d5f1ffec476e05f17cc9a78bc7a27a6a857842170bdf8d6f98d2f" +checksum = "422ee0de9031b5b948b97a8fc04e3aa35230001a722ddd27943e0be31564ce4c" dependencies = [ "atomic", - "getrandom 0.2.7", + "getrandom 0.2.8", "rand 0.8.5", "serde", "uuid-macro-internal", @@ -7810,7 +7930,7 @@ dependencies = [ "getset", "rustversion", "thiserror", - "time 0.3.14", + "time 0.3.17", ] [[package]] @@ -7975,9 +8095,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1c760f0d366a6c24a02ed7816e23e691f5d92291f94d15e836006fd11b04daf" +checksum = "368bfe657969fb01238bb756d351dcade285e0f6fcbd36dcb23359a5169975be" dependencies = [ "webpki", ] @@ -8193,9 +8313,9 @@ dependencies = [ [[package]] name = "wyz" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b31594f29d27036c383b53b59ed3476874d518f0efb151b27a4c275141390e" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" dependencies = [ "tap", ] @@ -8208,18 +8328,18 @@ checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" [[package]] name = "zstd" -version = "0.10.2+zstd.1.5.2" +version = "0.11.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4a6bd64f22b5e3e94b4e238669ff9f10815c27a5180108b849d24174a83847" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "4.1.6+zstd.1.5.2" +version = "5.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b61c51bb270702d6167b8ce67340d2754b088d0c091b06e593aa772c3ee9bb" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" dependencies = [ "libc", "zstd-sys", @@ -8227,9 +8347,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.6.3+zstd.1.5.2" +version = "2.0.4+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" +checksum = "4fa202f2ef00074143e219d15b62ffc317d17cc33909feac471c044087cad7b0" dependencies = [ "cc", "libc", diff --git a/Cargo.toml b/Cargo.toml index a960138d4b..80a592b464 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ members = [ "src/common/time", "src/datanode", "src/datatypes", - "src/datatypes2", "src/frontend", "src/log-store", "src/meta-client", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 2d2819178e..ea8d78ef52 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -5,10 +5,10 @@ edition = "2021" license = "Apache-2.0" [dependencies] -arrow = "10" +arrow = "26.0.0" clap = { version = "4.0", features = ["derive"] } client = { path = "../src/client" } indicatif = "0.17.1" 
itertools = "0.10.5" -parquet = { version = "*" } +parquet = "26.0.0" tokio = { version = "1.21", features = ["full"] } diff --git a/benchmarks/src/bin/nyc-taxi.rs b/benchmarks/src/bin/nyc-taxi.rs index 0ca1f33182..f39b48c87e 100644 --- a/benchmarks/src/bin/nyc-taxi.rs +++ b/benchmarks/src/bin/nyc-taxi.rs @@ -20,7 +20,6 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; -use std::sync::Arc; use std::time::Instant; use arrow::array::{ArrayRef, PrimitiveArray, StringArray, TimestampNanosecondArray}; @@ -32,9 +31,7 @@ use client::api::v1::column::Values; use client::api::v1::{Column, ColumnDataType, ColumnDef, CreateExpr, InsertExpr}; use client::{Client, Database, Select}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; -use parquet::file::reader::FileReader; -use parquet::file::serialized_reader::SerializedFileReader; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use tokio::task::JoinSet; const DATABASE_NAME: &str = "greptime"; @@ -86,10 +83,14 @@ async fn write_data( pb_style: ProgressStyle, ) -> u128 { let file = std::fs::File::open(&path).unwrap(); - let file_reader = Arc::new(SerializedFileReader::new(file).unwrap()); - let row_num = file_reader.metadata().file_metadata().num_rows(); - let record_batch_reader = ParquetFileArrowReader::new(file_reader) - .get_record_reader(batch_size) + let record_batch_reader_builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let row_num = record_batch_reader_builder + .metadata() + .file_metadata() + .num_rows(); + let record_batch_reader = record_batch_reader_builder + .with_batch_size(batch_size) + .build() .unwrap(); let progress_bar = mpb.add(ProgressBar::new(row_num as _)); progress_bar.set_style(pb_style); @@ -210,9 +211,10 @@ fn build_values(column: &ArrayRef) -> Values { | DataType::FixedSizeList(_, _) | DataType::LargeList(_) | DataType::Struct(_) - | DataType::Union(_, _) + | DataType::Union(_, _, _) | DataType::Dictionary(_, _) - | DataType::Decimal(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) | DataType::Map(_, _) => todo!(), } } diff --git a/src/api/greptime/v1/column.proto b/src/api/greptime/v1/column.proto index ec6993abe9..6f5692747e 100644 --- a/src/api/greptime/v1/column.proto +++ b/src/api/greptime/v1/column.proto @@ -32,7 +32,10 @@ message Column { repeated int32 date_values = 14; repeated int64 datetime_values = 15; - repeated int64 ts_millis_values = 16; + repeated int64 ts_second_values = 16; + repeated int64 ts_millisecond_values = 17; + repeated int64 ts_microsecond_values = 18; + repeated int64 ts_nanosecond_values = 19; } // The array of non-null values in this column. 
// @@ -75,5 +78,8 @@ enum ColumnDataType { STRING = 12; DATE = 13; DATETIME = 14; - TIMESTAMP = 15; + TIMESTAMP_SECOND = 15; + TIMESTAMP_MILLISECOND = 16; + TIMESTAMP_MICROSECOND = 17; + TIMESTAMP_NANOSECOND = 18; } diff --git a/src/api/src/helper.rs b/src/api/src/helper.rs index cdcf704c8d..72fb0c507b 100644 --- a/src/api/src/helper.rs +++ b/src/api/src/helper.rs @@ -15,6 +15,7 @@ use common_base::BitVec; use common_time::timestamp::TimeUnit; use datatypes::prelude::ConcreteDataType; +use datatypes::types::TimestampType; use datatypes::value::Value; use datatypes::vectors::VectorRef; use snafu::prelude::*; @@ -56,7 +57,16 @@ impl From for ConcreteDataType { ColumnDataType::String => ConcreteDataType::string_datatype(), ColumnDataType::Date => ConcreteDataType::date_datatype(), ColumnDataType::Datetime => ConcreteDataType::datetime_datatype(), - ColumnDataType::Timestamp => ConcreteDataType::timestamp_millis_datatype(), + ColumnDataType::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + ColumnDataType::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + ColumnDataType::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + ColumnDataType::TimestampNanosecond => { + ConcreteDataType::timestamp_nanosecond_datatype() + } } } } @@ -81,7 +91,12 @@ impl TryFrom for ColumnDataTypeWrapper { ConcreteDataType::String(_) => ColumnDataType::String, ConcreteDataType::Date(_) => ColumnDataType::Date, ConcreteDataType::DateTime(_) => ColumnDataType::Datetime, - ConcreteDataType::Timestamp(_) => ColumnDataType::Timestamp, + ConcreteDataType::Timestamp(unit) => match unit { + TimestampType::Second(_) => ColumnDataType::TimestampSecond, + TimestampType::Millisecond(_) => ColumnDataType::TimestampMillisecond, + TimestampType::Microsecond(_) => ColumnDataType::TimestampMicrosecond, + TimestampType::Nanosecond(_) => ColumnDataType::TimestampNanosecond, + }, ConcreteDataType::Null(_) | ConcreteDataType::List(_) => { return error::IntoColumnDataTypeSnafu { from: datatype }.fail() } @@ -153,8 +168,20 @@ impl Values { datetime_values: Vec::with_capacity(capacity), ..Default::default() }, - ColumnDataType::Timestamp => Values { - ts_millis_values: Vec::with_capacity(capacity), + ColumnDataType::TimestampSecond => Values { + ts_second_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampMillisecond => Values { + ts_millisecond_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampMicrosecond => Values { + ts_microsecond_values: Vec::with_capacity(capacity), + ..Default::default() + }, + ColumnDataType::TimestampNanosecond => Values { + ts_nanosecond_values: Vec::with_capacity(capacity), ..Default::default() }, } @@ -187,9 +214,12 @@ impl Column { Value::Binary(val) => values.binary_values.push(val.to_vec()), Value::Date(val) => values.date_values.push(val.val()), Value::DateTime(val) => values.datetime_values.push(val.val()), - Value::Timestamp(val) => values - .ts_millis_values - .push(val.convert_to(TimeUnit::Millisecond)), + Value::Timestamp(val) => match val.unit() { + TimeUnit::Second => values.ts_second_values.push(val.value()), + TimeUnit::Millisecond => values.ts_millisecond_values.push(val.value()), + TimeUnit::Microsecond => values.ts_microsecond_values.push(val.value()), + TimeUnit::Nanosecond => values.ts_nanosecond_values.push(val.value()), + }, Value::List(_) => unreachable!(), }); self.null_mask = null_mask.into_vec(); @@ -200,7 +230,10 @@ impl 
Column { mod tests { use std::sync::Arc; - use datatypes::vectors::BooleanVector; + use datatypes::vectors::{ + BooleanVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, + }; use super::*; @@ -258,8 +291,8 @@ mod tests { let values = values.datetime_values; assert_eq!(2, values.capacity()); - let values = Values::with_capacity(ColumnDataType::Timestamp, 2); - let values = values.ts_millis_values; + let values = Values::with_capacity(ColumnDataType::TimestampMillisecond, 2); + let values = values.ts_millisecond_values; assert_eq!(2, values.capacity()); } @@ -326,8 +359,8 @@ mod tests { ColumnDataTypeWrapper(ColumnDataType::Datetime).into() ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - ColumnDataTypeWrapper(ColumnDataType::Timestamp).into() + ConcreteDataType::timestamp_millisecond_datatype(), + ColumnDataTypeWrapper(ColumnDataType::TimestampMillisecond).into() ); } @@ -394,8 +427,8 @@ mod tests { ConcreteDataType::datetime_datatype().try_into().unwrap() ); assert_eq!( - ColumnDataTypeWrapper(ColumnDataType::Timestamp), - ConcreteDataType::timestamp_millis_datatype() + ColumnDataTypeWrapper(ColumnDataType::TimestampMillisecond), + ConcreteDataType::timestamp_millisecond_datatype() .try_into() .unwrap() ); @@ -412,7 +445,48 @@ mod tests { assert!(result.is_err()); assert_eq!( result.unwrap_err().to_string(), - "Failed to create column datatype from List(ListType { inner: Boolean(BooleanType) })" + "Failed to create column datatype from List(ListType { item_type: Boolean(BooleanType) })" + ); + } + + #[test] + fn test_column_put_timestamp_values() { + let mut column = Column { + column_name: "test".to_string(), + semantic_type: 0, + values: Some(Values { + ..Default::default() + }), + null_mask: vec![], + datatype: 0, + }; + + let vector = Arc::new(TimestampNanosecondVector::from_vec(vec![1, 2, 3])); + column.push_vals(3, vector); + assert_eq!( + vec![1, 2, 3], + column.values.as_ref().unwrap().ts_nanosecond_values + ); + + let vector = Arc::new(TimestampMillisecondVector::from_vec(vec![4, 5, 6])); + column.push_vals(3, vector); + assert_eq!( + vec![4, 5, 6], + column.values.as_ref().unwrap().ts_millisecond_values + ); + + let vector = Arc::new(TimestampMicrosecondVector::from_vec(vec![7, 8, 9])); + column.push_vals(3, vector); + assert_eq!( + vec![7, 8, 9], + column.values.as_ref().unwrap().ts_microsecond_values + ); + + let vector = Arc::new(TimestampSecondVector::from_vec(vec![10, 11, 12])); + column.push_vals(3, vector); + assert_eq!( + vec![10, 11, 12], + column.values.as_ref().unwrap().ts_second_values ); } diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 1c6f7a063e..90adcf8e8a 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -19,9 +19,7 @@ common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/catalog/src/error.rs b/src/catalog/src/error.rs index 05e6944cd5..f344ae3bb8 100644 --- a/src/catalog/src/error.rs +++ b/src/catalog/src/error.rs @@ -17,7 +17,7 @@ use std::any::Any; use common_error::ext::{BoxedError, ErrorExt}; use common_error::prelude::{Snafu, StatusCode}; use 
datafusion::error::DataFusionError; -use datatypes::arrow; +use datatypes::prelude::ConcreteDataType; use datatypes::schema::RawSchema; use snafu::{Backtrace, ErrorCompat}; @@ -51,14 +51,12 @@ pub enum Error { SystemCatalog { msg: String, backtrace: Backtrace }, #[snafu(display( - "System catalog table type mismatch, expected: binary, found: {:?} source: {}", + "System catalog table type mismatch, expected: binary, found: {:?}", data_type, - source ))] SystemCatalogTypeMismatch { - data_type: arrow::datatypes::DataType, - #[snafu(backtrace)] - source: datatypes::error::Error, + data_type: ConcreteDataType, + backtrace: Backtrace, }, #[snafu(display("Invalid system catalog entry type: {:?}", entry_type))] @@ -222,10 +220,11 @@ impl ErrorExt for Error { | Error::ValueDeserialize { .. } | Error::Io { .. } => StatusCode::StorageUnavailable, - Error::RegisterTable { .. } => StatusCode::Internal, + Error::RegisterTable { .. } | Error::SystemCatalogTypeMismatch { .. } => { + StatusCode::Internal + } Error::ReadSystemCatalog { source, .. } => source.status_code(), - Error::SystemCatalogTypeMismatch { source, .. } => source.status_code(), Error::InvalidCatalogValue { source, .. } => source.status_code(), Error::TableExists { .. } => StatusCode::TableAlreadyExists, @@ -265,7 +264,6 @@ impl From<Error> for DataFusionError { #[cfg(test)] mod tests { use common_error::mock::MockError; - use datatypes::arrow::datatypes::DataType; use snafu::GenerateImplicitData; use super::*; @@ -314,11 +312,8 @@ mod tests { assert_eq!( StatusCode::Internal, Error::SystemCatalogTypeMismatch { - data_type: DataType::Boolean, - source: datatypes::error::Error::UnsupportedArrowType { - arrow_type: DataType::Boolean, - backtrace: Backtrace::generate() - } + data_type: ConcreteDataType::binary_datatype(), + backtrace: Backtrace::generate(), } .status_code() ); diff --git a/src/catalog/src/helper.rs b/src/catalog/src/helper.rs index 2caf098865..062d07bc19 100644 --- a/src/catalog/src/helper.rs +++ b/src/catalog/src/helper.rs @@ -138,7 +138,7 @@ impl TableGlobalKey { /// Table global info contains necessary info for a datanode to create table regions, including /// table id, table meta(schema...), region id allocation across datanodes. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TableGlobalValue { /// Id of datanode that created the global table info kv. only for debugging. pub node_id: u64, diff --git a/src/catalog/src/local/manager.rs b/src/catalog/src/local/manager.rs index d09411cbaa..e4c89933e0 100644 --- a/src/catalog/src/local/manager.rs +++ b/src/catalog/src/local/manager.rs @@ -145,27 +145,34 @@ impl LocalCatalogManager { /// Convert `RecordBatch` to a vector of `Entry`. 
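/// Each column is recovered by downcasting the type-erased column to its
/// concrete vector type, so schema drift surfaces as a typed
/// `SystemCatalogTypeMismatch` error instead of a panic. A minimal sketch of
/// the downcast step (illustrative only, assuming the entry-type column is
/// really a `UInt8Vector`):
///
/// ```ignore
/// let entry_type = rb
///     .column(ENTRY_TYPE_INDEX)
///     .as_any()
///     .downcast_ref::<UInt8Vector>() // None when the stored type differs
///     .expect("entry_type column must be UInt8");
/// ```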
fn record_batch_to_entry(rb: RecordBatch) -> Result<Vec<Entry>> { ensure!( - rb.df_recordbatch.columns().len() >= 6, + rb.num_columns() >= 6, SystemCatalogSnafu { - msg: format!("Length mismatch: {}", rb.df_recordbatch.columns().len()) + msg: format!("Length mismatch: {}", rb.num_columns()) } ); - let entry_type = UInt8Vector::try_from_arrow_array(&rb.df_recordbatch.columns()[0]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[ENTRY_TYPE_INDEX] - .data_type() - .clone(), + let entry_type = rb + .column(ENTRY_TYPE_INDEX) + .as_any() + .downcast_ref::<UInt8Vector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(ENTRY_TYPE_INDEX).data_type(), })?; - let key = BinaryVector::try_from_arrow_array(&rb.df_recordbatch.columns()[1]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[KEY_INDEX].data_type().clone(), + let key = rb + .column(KEY_INDEX) + .as_any() + .downcast_ref::<BinaryVector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(KEY_INDEX).data_type(), })?; - let value = BinaryVector::try_from_arrow_array(&rb.df_recordbatch.columns()[3]) - .with_context(|_| SystemCatalogTypeMismatchSnafu { - data_type: rb.df_recordbatch.columns()[VALUE_INDEX].data_type().clone(), + let value = rb + .column(VALUE_INDEX) + .as_any() + .downcast_ref::<BinaryVector>() + .with_context(|| SystemCatalogTypeMismatchSnafu { + data_type: rb.column(VALUE_INDEX).data_type(), })?; let mut res = Vec::with_capacity(rb.num_rows()); diff --git a/src/catalog/src/system.rs b/src/catalog/src/system.rs index b6555b9353..960be1fa24 100644 --- a/src/catalog/src/system.rs +++ b/src/catalog/src/system.rs @@ -21,14 +21,13 @@ use common_catalog::consts::{ SYSTEM_CATALOG_TABLE_ID, SYSTEM_CATALOG_TABLE_NAME, }; use common_query::logical_plan::Expr; -use common_query::physical_plan::{PhysicalPlanRef, RuntimeEnv}; +use common_query::physical_plan::{PhysicalPlanRef, SessionContext}; use common_recordbatch::SendableRecordBatchStream; use common_telemetry::debug; -use common_time::timestamp::Timestamp; use common_time::util; use datatypes::prelude::{ConcreteDataType, ScalarVector}; use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef}; -use datatypes::vectors::{BinaryVector, TimestampVector, UInt8Vector}; +use datatypes::vectors::{BinaryVector, TimestampMillisecondVector, UInt8Vector}; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt}; use table::engine::{EngineContext, TableEngineRef}; @@ -127,13 +126,14 @@ impl SystemCatalogTable { /// Create a stream of all entries inside system catalog table pub async fn records(&self) -> Result<SendableRecordBatchStream> { let full_projection = None; + let ctx = SessionContext::new(); let scan = self .table .scan(&full_projection, &[], None) .await .context(error::SystemCatalogTableScanSnafu)?; let stream = scan - .execute(0, Arc::new(RuntimeEnv::default())) + .execute(0, ctx.task_ctx()) .context(error::SystemCatalogTableScanExecSnafu)?; Ok(stream) } @@ -161,7 +161,7 @@ fn build_system_catalog_schema() -> Schema { ), ColumnSchema::new( "timestamp".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), @@ -172,12 +172,12 @@ ), ColumnSchema::new( "gmt_created".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ColumnSchema::new( "gmt_modified".to_string(), - 
ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ]; @@ -222,7 +222,7 @@ pub fn build_insert_request(entry_type: EntryType, key: &[u8], value: &[u8]) -> // Timestamp in key part is intentionally left to 0 columns_values.insert( "timestamp".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis(0)])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ); columns_values.insert( @@ -230,18 +230,15 @@ pub fn build_insert_request(entry_type: EntryType, key: &[u8], value: &[u8]) -> Arc::new(BinaryVector::from_slice(&[value])) as _, ); + let now = util::current_time_millis(); columns_values.insert( "gmt_created".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); columns_values.insert( "gmt_modified".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); InsertRequest { diff --git a/src/catalog/src/tables.rs b/src/catalog/src/tables.rs index b11fc870de..8dd59fb1bf 100644 --- a/src/catalog/src/tables.rs +++ b/src/catalog/src/tables.rs @@ -26,9 +26,9 @@ use common_query::logical_plan::Expr; use common_query::physical_plan::PhysicalPlanRef; use common_recordbatch::error::Result as RecordBatchResult; use common_recordbatch::{RecordBatch, RecordBatchStream}; -use datatypes::prelude::{ConcreteDataType, VectorBuilder}; +use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::value::Value; +use datatypes::value::ValueRef; use datatypes::vectors::VectorRef; use futures::Stream; use snafu::ResultExt; @@ -149,26 +149,33 @@ fn tables_to_record_batch( engine: &str, ) -> Vec<VectorRef> { let mut catalog_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut schema_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut table_name_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); let mut engine_vec = - VectorBuilder::with_capacity(ConcreteDataType::string_datatype(), table_names.len()); + ConcreteDataType::string_datatype().create_mutable_vector(table_names.len()); for table_name in table_names { - catalog_vec.push(&Value::String(catalog_name.into())); - schema_vec.push(&Value::String(schema_name.into())); - table_name_vec.push(&Value::String(table_name.into())); - engine_vec.push(&Value::String(engine.into())); + // Safety: All these vectors are string type. 
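// (`push_value_ref` fails only when the pushed value's type does not match
// the builder's datatype; every builder above was created from
// `string_datatype()`, so the unwraps below are expected to be infallible.)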
+ catalog_vec + .push_value_ref(ValueRef::String(catalog_name)) + .unwrap(); + schema_vec + .push_value_ref(ValueRef::String(schema_name)) + .unwrap(); + table_name_vec + .push_value_ref(ValueRef::String(&table_name)) + .unwrap(); + engine_vec.push_value_ref(ValueRef::String(engine)).unwrap(); } vec![ - catalog_vec.finish(), - schema_vec.finish(), - table_name_vec.finish(), - engine_vec.finish(), + catalog_vec.to_vector(), + schema_vec.to_vector(), + table_name_vec.to_vector(), + engine_vec.to_vector(), ] } @@ -340,9 +347,7 @@ fn build_schema_for_tables() -> Schema { #[cfg(test)] mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; - use common_query::physical_plan::RuntimeEnv; - use datatypes::arrow::array::Utf8Array; - use datatypes::arrow::datatypes::DataType; + use common_query::physical_plan::SessionContext; use futures_util::StreamExt; use table::table::numbers::NumbersTable; @@ -366,56 +371,47 @@ mod tests { let tables = Tables::new(catalog_list, "test_engine".to_string()); let tables_stream = tables.scan(&None, &[], None).await.unwrap(); - let mut tables_stream = tables_stream - .execute(0, Arc::new(RuntimeEnv::default())) - .unwrap(); + let session_ctx = SessionContext::new(); + let mut tables_stream = tables_stream.execute(0, session_ctx.task_ctx()).unwrap(); if let Some(t) = tables_stream.next().await { - let batch = t.unwrap().df_recordbatch; + let batch = t.unwrap(); assert_eq!(1, batch.num_rows()); assert_eq!(4, batch.num_columns()); - assert_eq!(&DataType::Utf8, batch.column(0).data_type()); - assert_eq!(&DataType::Utf8, batch.column(1).data_type()); - assert_eq!(&DataType::Utf8, batch.column(2).data_type()); - assert_eq!(&DataType::Utf8, batch.column(3).data_type()); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(0).data_type() + ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(1).data_type() + ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(2).data_type() + ); + assert_eq!( + ConcreteDataType::string_datatype(), + batch.column(3).data_type() + ); assert_eq!( "greptime", - batch - .column(0) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(0).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "public", - batch - .column(1) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(1).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "test_table", - batch - .column(2) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(2).get_ref(0).as_string().unwrap().unwrap() ); assert_eq!( "test_engine", - batch - .column(3) - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() - .value(0) + batch.column(3).get_ref(0).as_string().unwrap().unwrap() ); } else { panic!("Record batch should not be empty!") diff --git a/src/client/Cargo.toml b/src/client/Cargo.toml index da58e9c884..5c19f89970 100644 --- a/src/client/Cargo.toml +++ b/src/client/Cargo.toml @@ -15,9 +15,7 @@ common-grpc-expr = { path = "../common/grpc-expr" } common-query = { path = "../common/query" } common-recordbatch = { path = "../common/recordbatch" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } enum_dispatch = "0.3" parking_lot = "0.12" diff --git a/src/client/examples/logical.rs b/src/client/examples/logical.rs index 9e00269f2f..9ea6cdc42f 100644 --- a/src/client/examples/logical.rs +++ 
b/src/client/examples/logical.rs @@ -41,7 +41,7 @@ async fn run() { column_defs: vec![ ColumnDef { name: "timestamp".to_string(), - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, is_nullable: false, default_constraint: None, }, diff --git a/src/client/src/database.rs b/src/client/src/database.rs index 54ab889bf5..2dcc62569b 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -318,12 +318,11 @@ mod tests { fn create_test_column(vector: VectorRef) -> Column { let wrapper: ColumnDataTypeWrapper = vector.data_type().try_into().unwrap(); - let array = vector.to_arrow_array(); Column { column_name: "test".to_string(), semantic_type: 1, - values: Some(values(&[array.clone()]).unwrap()), - null_mask: null_mask(&vec![array], vector.len()), + values: Some(values(&[vector.clone()]).unwrap()), + null_mask: null_mask(&[vector.clone()], vector.len()), datatype: wrapper.datatype() as i32, } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 3b98332b33..6bea05ce67 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use anymap::AnyMap; use clap::Parser; use frontend::frontend::{Frontend, FrontendOptions}; @@ -138,14 +136,14 @@ impl TryFrom for FrontendOptions { if let Some(addr) = cmd.mysql_addr { opts.mysql_options = Some(MysqlOptions { addr, - tls: Arc::new(tls_option.clone()), + tls: tls_option.clone(), ..Default::default() }); } if let Some(addr) = cmd.postgres_addr { opts.postgres_options = Some(PostgresOptions { addr, - tls: Arc::new(tls_option), + tls: tls_option, ..Default::default() }); } diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 42f1e0a71e..d4b65c3a85 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
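// (Same rationale as the frontend.rs hunk above: `TlsOption` is small,
// cloneable configuration data, so the option structs can hold it by value
// and the `Arc` wrapper plus its import become unnecessary.)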
-use std::sync::Arc; - use anymap::AnyMap; use clap::Parser; use common_telemetry::info; @@ -262,12 +260,12 @@ impl TryFrom for FrontendOptions { let tls_option = TlsOption::new(cmd.tls_mode, cmd.tls_cert_path, cmd.tls_key_path); if let Some(mut mysql_options) = opts.mysql_options { - mysql_options.tls = Arc::new(tls_option.clone()); + mysql_options.tls = tls_option.clone(); opts.mysql_options = Some(mysql_options); } if let Some(mut postgres_options) = opts.postgres_options { - postgres_options.tls = Arc::new(tls_option); + postgres_options.tls = tls_option; opts.postgres_options = Some(postgres_options); } diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index b14738fe94..ce49cb5e5b 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -11,7 +11,7 @@ common-error = { path = "../error" } common-function-macro = { path = "../function-macro" } common-query = { path = "../query" } common-time = { path = "../time" } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0.0" datatypes = { path = "../../datatypes" } libc = "0.2" num = "0.4" diff --git a/src/common/function/src/error.rs b/src/common/function/src/error.rs deleted file mode 100644 index 73c3928a00..0000000000 --- a/src/common/function/src/error.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use common_error::prelude::*; -pub use common_query::error::{Error, Result}; -use datatypes::error::Error as DataTypeError; - -#[derive(Debug, Snafu)] -#[snafu(visibility(pub))] -pub enum InnerError { - #[snafu(display("Fail to get scalar vector, {}", source))] - GetScalarVector { - source: DataTypeError, - backtrace: Backtrace, - }, -} - -impl ErrorExt for InnerError { - fn backtrace_opt(&self) -> Option<&Backtrace> { - ErrorCompat::backtrace(self) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -impl From for Error { - fn from(err: InnerError) -> Self { - Self::new(err) - } -} - -#[cfg(test)] -mod tests { - use snafu::GenerateImplicitData; - - use super::*; - - fn raise_datatype_error() -> std::result::Result<(), DataTypeError> { - Err(DataTypeError::Conversion { - from: "test".to_string(), - backtrace: Backtrace::generate(), - }) - } - - #[test] - fn test_get_scalar_vector_error() { - let err: Error = raise_datatype_error() - .context(GetScalarVectorSnafu) - .err() - .unwrap() - .into(); - assert!(err.backtrace_opt().is_some()); - } -} diff --git a/src/common/function/src/lib.rs b/src/common/function/src/lib.rs index 5a1b8edacb..8d15fe0b25 100644 --- a/src/common/function/src/lib.rs +++ b/src/common/function/src/lib.rs @@ -12,5 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. 
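// (The `error` module removed below is the file deleted just above: it only
// re-exported common_query::error::{Error, Result} and wrapped a single
// datatypes error, so callers can use common_query's error types directly.)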
-pub mod error; pub mod scalars; diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index d362ea5f89..e9499b2151 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -23,6 +23,5 @@ pub(crate) mod test; mod timestamp; pub mod udf; -pub use aggregate::MedianAccumulatorCreator; pub use function::{Function, FunctionRef}; pub use function_registry::{FunctionRegistry, FUNCTION_REGISTRY}; diff --git a/src/common/function/src/scalars/aggregate/mod.rs b/src/common/function/src/scalars/aggregate.rs similarity index 96% rename from src/common/function/src/scalars/aggregate/mod.rs rename to src/common/function/src/scalars/aggregate.rs index 8a4712a1b8..f605fff2f2 100644 --- a/src/common/function/src/scalars/aggregate/mod.rs +++ b/src/common/function/src/scalars/aggregate.rs @@ -16,7 +16,6 @@ mod argmax; mod argmin; mod diff; mod mean; -mod median; mod percentile; mod polyval; mod scipy_stats_norm_cdf; @@ -29,7 +28,6 @@ pub use argmin::ArgminAccumulatorCreator; use common_query::logical_plan::AggregateFunctionCreatorRef; pub use diff::DiffAccumulatorCreator; pub use mean::MeanAccumulatorCreator; -pub use median::MedianAccumulatorCreator; pub use percentile::PercentileAccumulatorCreator; pub use polyval::PolyvalAccumulatorCreator; pub use scipy_stats_norm_cdf::ScipyStatsNormCdfAccumulatorCreator; @@ -88,7 +86,6 @@ impl AggregateFunctions { }; } - register_aggr_func!("median", 1, MedianAccumulatorCreator); register_aggr_func!("diff", 1, DiffAccumulatorCreator); register_aggr_func!("mean", 1, MeanAccumulatorCreator); register_aggr_func!("polyval", 2, PolyvalAccumulatorCreator); diff --git a/src/common/function/src/scalars/aggregate/argmax.rs b/src/common/function/src/scalars/aggregate/argmax.rs index 0b63a766bd..d42d4550c6 100644 --- a/src/common/function/src/scalars/aggregate/argmax.rs +++ b/src/common/function/src/scalars/aggregate/argmax.rs @@ -20,24 +20,22 @@ use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Resul use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; +use datatypes::vectors::{ConstantVector, Helper}; use datatypes::with_match_primitive_type_id; use snafu::ensure; // https://numpy.org/doc/stable/reference/generated/numpy.argmax.html // return the index of the max value #[derive(Debug, Default)] -pub struct Argmax<T> -where - T: Primitive + PartialOrd, -{ +pub struct Argmax<T> { max: Option<T>, n: u64, } impl<T> Argmax<T> where - T: Primitive + PartialOrd, + T: PartialOrd + Copy, { fn update(&mut self, value: T, index: u64) { if let Some(Ordering::Less) = self.max.partial_cmp(&Some(value)) { @@ -49,8 +47,7 @@ where impl<T> Accumulator for Argmax<T> where - T: Primitive + PartialOrd, - for<'a> T: Scalar<RefType<'a> = T>, + T: WrapperType + PartialOrd, { fn state(&self) -> Result<Vec<Value>> { match self.max { @@ -66,10 +63,10 @@ where let column = &values[0]; let column: &<T as Scalar>::VectorType = if column.is_const() { - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; for (i, v) in column.iter_data().enumerate() { if let Some(value) = v { @@ -93,8 +90,8 @@ where let max = &states[0]; let index = &states[1]; - let max: &<T as Scalar>::VectorType = unsafe { VectorHelper::static_cast(max) }; - let index: &<u64 as Scalar>::VectorType = unsafe { VectorHelper::static_cast(index) }; + let max: &<T as Scalar>::VectorType = unsafe { Helper::static_cast(max) }; + let index: &<u64 as Scalar>::VectorType = unsafe { Helper::static_cast(index) }; index .iter_data() .flatten() @@ -122,7 +119,7 @@ impl AggregateFunctionCreator for ArgmaxAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Argmax::<$S>::default())) + Ok(Box::new(Argmax::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -154,7 +151,7 @@ impl AggregateFunctionCreator for ArgmaxAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -166,21 +163,19 @@ mod test { // test update one not-null value let mut argmax = Argmax::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![Some(42)]))]; + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(argmax.update_batch(&v).is_ok()); assert_eq!(Value::from(0_u64), argmax.evaluate().unwrap()); // test update one null value let mut argmax = Argmax::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ - Option::<i32>::None, - ]))]; + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![Option::<i32>::None]))]; assert!(argmax.update_batch(&v).is_ok()); assert_eq!(Value::Null, argmax.evaluate().unwrap()); // test update no null-value batch let mut argmax = Argmax::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(3), @@ -190,7 +185,7 @@ mod test { // test update null-value batch let mut argmax = Argmax::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(4), @@ -201,7 +196,7 @@ mod test { // test update with constant vector let mut argmax = Argmax::<i32>::default(); let v: Vec<VectorRef> = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::<i32>::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(argmax.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/argmin.rs b/src/common/function/src/scalars/aggregate/argmin.rs index bcbd6571c5..5b93561286 100644 --- a/src/common/function/src/scalars/aggregate/argmin.rs +++ b/src/common/function/src/scalars/aggregate/argmin.rs @@ -20,23 +20,20 @@ use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Resul use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::vectors::{ConstantVector, Helper}; use datatypes::with_match_primitive_type_id; use snafu::ensure; // // https://numpy.org/doc/stable/reference/generated/numpy.argmin.html #[derive(Debug, Default)] -pub struct Argmin<T> -where - T: Primitive + PartialOrd, -{ +pub struct Argmin<T> { min: Option<T>, n: u32, } impl<T> Argmin<T> where - T: Primitive + PartialOrd, + T: Copy + PartialOrd, { fn update(&mut self, value: T, index: u32) { match self.min { @@ -56,8 +53,7 @@ where impl<T> Accumulator for Argmin<T> where - T: Primitive + PartialOrd, - for<'a> T: Scalar<RefType<'a> = T>, + T: WrapperType + PartialOrd, { fn state(&self) -> Result<Vec<Value>> { match self.min { @@ -75,10 +71,10 @@ where let column = &values[0]; let column: &<T as Scalar>::VectorType = if column.is_const() { - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; for (i, v) in column.iter_data().enumerate() { if let Some(value) = v { @@ -102,8 +98,8 @@ where let min = &states[0]; let index = &states[1]; - let min: &<T as Scalar>::VectorType = unsafe { VectorHelper::static_cast(min) }; - let index: &<u32 as Scalar>::VectorType = unsafe { VectorHelper::static_cast(index) }; + let min: &<T as Scalar>::VectorType = unsafe { Helper::static_cast(min) }; + let index: &<u32 as Scalar>::VectorType = unsafe { Helper::static_cast(index) }; index .iter_data() .flatten() @@ -131,7 +127,7 @@ impl AggregateFunctionCreator for ArgminAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Argmin::<$S>::default())) + Ok(Box::new(Argmin::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -163,7 +159,7 @@ impl AggregateFunctionCreator for ArgminAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -175,21 +171,19 @@ mod test { // test update one not-null value let mut argmin = Argmin::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![Some(42)]))]; + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(argmin.update_batch(&v).is_ok()); assert_eq!(Value::from(0_u32), argmin.evaluate().unwrap()); // test update one null value let mut argmin = Argmin::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ - Option::<i32>::None, - ]))]; + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![Option::<i32>::None]))]; assert!(argmin.update_batch(&v).is_ok()); assert_eq!(Value::Null, argmin.evaluate().unwrap()); // test update no null-value batch let mut argmin = Argmin::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(3), @@ -199,7 +193,7 @@ mod test { // test update null-value batch let mut argmin = Argmin::<i32>::default(); - let v: Vec<VectorRef> = vec![Arc::new(PrimitiveVector::<i32>::from(vec![ + let v: Vec<VectorRef> = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(4), @@ -210,7 +204,7 @@ mod test { // test update with constant vector let mut argmin = Argmin::<i32>::default(); let v: Vec<VectorRef> = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::<i32>::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(argmin.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/diff.rs b/src/common/function/src/scalars/aggregate/diff.rs index d0e7ca3406..3f7ecc2400 100644 --- a/src/common/function/src/scalars/aggregate/diff.rs +++ b/src/common/function/src/scalars/aggregate/diff.rs @@ -22,40 +22,32 @@ use common_query::error::{ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::types::PrimitiveType; use datatypes::value::ListValue; -use datatypes::vectors::{ConstantVector, ListVector}; +use datatypes::vectors::{ConstantVector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; // https://numpy.org/doc/stable/reference/generated/numpy.diff.html +// I is the input type, O is the output type. #[derive(Debug, Default)] -pub struct Diff<T, SubT> -where - T: Primitive + AsPrimitive<SubT>, - SubT: Primitive + std::ops::Sub<Output = SubT>, -{ - values: Vec<T>, - _phantom: PhantomData<SubT>, +pub struct Diff<I, O> { + values: Vec<I>, + _phantom: PhantomData<O>, } -impl<T, SubT> Diff<T, SubT> -where - T: Primitive + AsPrimitive<SubT>, - SubT: Primitive + std::ops::Sub<Output = SubT>, -{ - fn push(&mut self, value: T) { +impl<I, O> Diff<I, O> { + fn push(&mut self, value: I) { self.values.push(value); } } -impl<T, SubT> Accumulator for Diff<T, SubT> +impl<I, O> Accumulator for Diff<I, O> where - T: Primitive + AsPrimitive<SubT>, - for<'a> T: Scalar<RefType<'a> = T>, - SubT: Primitive + std::ops::Sub<Output = SubT>, - for<'a> SubT: Scalar<RefType<'a> = SubT>, + I: WrapperType, + O: WrapperType, + I::Native: AsPrimitive<O::Native>, + O::Native: std::ops::Sub<Output = O::Native>, { fn state(&self) -> Result<Vec<Value>> { let nums = self @@ -65,7 +57,7 @@ where .collect::<Vec<Value>>(); Ok(vec![Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + I::LogicalType::build_data_type(), ))]) } @@ -78,12 +70,12 @@ where let column = &values[0]; let mut len = 1; - let column: &<T as Scalar>::VectorType = if column.is_const() { + let column: &<I as Scalar>::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -109,8 +101,9 @@ ), })?; for state in states.values_iter() { - let state = state.context(FromScalarValueSnafu)?; - self.update_batch(&[state])? + if let Some(state) = state.context(FromScalarValueSnafu)? { + self.update_batch(&[state])?; + } } Ok(()) } @@ -122,11 +115,14 @@ where let diff = self .values .windows(2) - .map(|x| (x[1].as_() - x[0].as_()).into()) + .map(|x| { + let native = x[1].into_native().as_() - x[0].into_native().as_(); + O::from_native(native).into() + }) .collect::<Vec<Value>>(); let diff = Value::List(ListValue::new( Some(Box::new(diff)), - SubT::default().into().data_type(), + O::LogicalType::build_data_type(), )); Ok(diff) } @@ -143,7 +139,7 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Diff::<$S,<$S as Primitive>::LargestType>::default())) + Ok(Box::new(Diff::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -163,7 +159,7 @@ impl AggregateFunctionCreator for DiffAccumulatorCreator { with_match_primitive_type_id!( input_types[0].logical_type_id(), |$S| { - Ok(ConcreteDataType::list_datatype(PrimitiveType::<<$S as Primitive>::LargestType>::default().into())) + Ok(ConcreteDataType::list_datatype($S::default().into())) }, { unreachable!() @@ -177,7 +173,7 @@ with_match_primitive_type_id!( input_types[0].logical_type_id(), |$S| { - Ok(vec![ConcreteDataType::list_datatype(PrimitiveType::<$S>::default().into())]) + Ok(vec![ConcreteDataType::list_datatype($S::default().into())]) }, { unreachable!() @@ -188,9 +184,10 @@ #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; + #[test] fn test_update_batch() { // test update empty batch, expect not updating anything @@ 
-201,21 +198,19 @@ mod test { // test update one not-null value let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(diff.update_batch(&v).is_ok()); assert_eq!(Value::Null, diff.evaluate().unwrap()); // test update one null value let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(diff.update_batch(&v).is_ok()); assert_eq!(Value::Null, diff.evaluate().unwrap()); // test update no null-value batch let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(2), @@ -232,7 +227,7 @@ mod test { // test update null-value batch let mut diff = Diff::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(3), @@ -251,7 +246,7 @@ mod test { // test update with constant vector let mut diff = Diff::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 4, ))]; let values = vec![Value::from(0_i64), Value::from(0_i64), Value::from(0_i64)]; diff --git a/src/common/function/src/scalars/aggregate/mean.rs b/src/common/function/src/scalars/aggregate/mean.rs index 2393a58cd2..ce619bb253 100644 --- a/src/common/function/src/scalars/aggregate/mean.rs +++ b/src/common/function/src/scalars/aggregate/mean.rs @@ -22,16 +22,14 @@ use common_query::error::{ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::vectors::{ConstantVector, Float64Vector, UInt64Vector}; +use datatypes::types::WrapperType; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, UInt64Vector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt}; #[derive(Debug, Default)] -pub struct Mean -where - T: Primitive + AsPrimitive, -{ +pub struct Mean { sum: f64, n: u64, _phantom: PhantomData, @@ -39,11 +37,12 @@ where impl Mean where - T: Primitive + AsPrimitive, + T: WrapperType, + T::Native: AsPrimitive, { #[inline(always)] fn push(&mut self, value: T) { - self.sum += value.as_(); + self.sum += value.into_native().as_(); self.n += 1; } @@ -56,8 +55,8 @@ where impl Accumulator for Mean where - T: Primitive + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType, + T::Native: AsPrimitive, { fn state(&self) -> Result> { Ok(vec![self.sum.into(), self.n.into()]) @@ -73,10 +72,10 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -150,7 +149,7 @@ impl AggregateFunctionCreator for MeanAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Mean::<$S>::default())) + Ok(Box::new(Mean::<<$S as 
LogicalPrimitiveType>::Native>::default())) }, { let err_msg = format!( @@ -182,7 +181,7 @@ impl AggregateFunctionCreator for MeanAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -194,21 +193,19 @@ mod test { // test update one not-null value let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Some(42)]))]; assert!(mean.update_batch(&v).is_ok()); assert_eq!(Value::from(42.0_f64), mean.evaluate().unwrap()); // test update one null value let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; + let v: Vec = vec![Arc::new(Int32Vector::from(vec![Option::::None]))]; assert!(mean.update_batch(&v).is_ok()); assert_eq!(Value::Null, mean.evaluate().unwrap()); // test update no null-value batch let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-1i32), Some(1), Some(2), @@ -218,7 +215,7 @@ mod test { // test update null-value batch let mut mean = Mean::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ + let v: Vec = vec![Arc::new(Int32Vector::from(vec![ Some(-2i32), None, Some(3), @@ -230,7 +227,7 @@ mod test { // test update with constant vector let mut mean = Mean::::default(); let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 10, ))]; assert!(mean.update_batch(&v).is_ok()); diff --git a/src/common/function/src/scalars/aggregate/median.rs b/src/common/function/src/scalars/aggregate/median.rs deleted file mode 100644 index 4c445c0fb9..0000000000 --- a/src/common/function/src/scalars/aggregate/median.rs +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::sync::Arc; - -use common_function_macro::{as_aggr_func_creator, AggrFuncTypeStore}; -use common_query::error::{ - CreateAccumulatorSnafu, DowncastVectorSnafu, FromScalarValueSnafu, Result, -}; -use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; -use common_query::prelude::*; -use datatypes::prelude::*; -use datatypes::types::OrdPrimitive; -use datatypes::value::ListValue; -use datatypes::vectors::{ConstantVector, ListVector}; -use datatypes::with_match_primitive_type_id; -use num::NumCast; -use snafu::{ensure, OptionExt, ResultExt}; - -// This median calculation algorithm's details can be found at -// https://leetcode.cn/problems/find-median-from-data-stream/ -// -// Basically, it uses two heaps, a maximum heap and a minimum. The maximum heap stores numbers that -// are not greater than the median, and the minimum heap stores the greater. 
In a streaming of -// numbers, when a number is arrived, we adjust the heaps' tops, so that either one top is the -// median or both tops can be averaged to get the median. -// -// The time complexity to update the median is O(logn), O(1) to get the median; and the space -// complexity is O(n). (Ignore the costs for heap expansion.) -// -// From the point of algorithm, [quick select](https://en.wikipedia.org/wiki/Quickselect) might be -// better. But to use quick select here, we need a mutable self in the final calculation(`evaluate`) -// to swap stored numbers in the states vector. Though we can make our `evaluate` received -// `&mut self`, DataFusion calls our accumulator with `&self` (see `DfAccumulatorAdaptor`). That -// means we have to introduce some kinds of interior mutability, and the overhead is not neglectable. -// -// TODO(LFC): Use quick select to get median when we can modify DataFusion's code, and benchmark with two-heap algorithm. -#[derive(Debug, Default)] -pub struct Median -where - T: Primitive, -{ - greater: BinaryHeap>>, - not_greater: BinaryHeap>, -} - -impl Median -where - T: Primitive, -{ - fn push(&mut self, value: T) { - let value = OrdPrimitive::(value); - - if self.not_greater.is_empty() { - self.not_greater.push(value); - return; - } - // The `unwrap`s below are safe because there are `push`s before them. - if value <= *self.not_greater.peek().unwrap() { - self.not_greater.push(value); - if self.not_greater.len() > self.greater.len() + 1 { - self.greater.push(Reverse(self.not_greater.pop().unwrap())); - } - } else { - self.greater.push(Reverse(value)); - if self.greater.len() > self.not_greater.len() { - self.not_greater.push(self.greater.pop().unwrap().0); - } - } - } -} - -// UDAFs are built using the trait `Accumulator`, that offers DataFusion the necessary functions -// to use them. -impl Accumulator for Median -where - T: Primitive, - for<'a> T: Scalar = T>, -{ - // This function serializes our state to `ScalarValue`, which DataFusion uses to pass this - // state between execution stages. Note that this can be arbitrary data. - // - // The `ScalarValue`s returned here will be passed in as argument `states: &[VectorRef]` to - // `merge_batch` function. - fn state(&self) -> Result> { - let nums = self - .greater - .iter() - .map(|x| &x.0) - .chain(self.not_greater.iter()) - .map(|&n| n.into()) - .collect::>(); - Ok(vec![Value::List(ListValue::new( - Some(Box::new(nums)), - T::default().into().data_type(), - ))]) - } - - // DataFusion calls this function to update the accumulator's state for a batch of inputs rows. - // It is expected this function to update the accumulator's state. - fn update_batch(&mut self, values: &[VectorRef]) -> Result<()> { - if values.is_empty() { - return Ok(()); - } - - ensure!(values.len() == 1, InvalidInputStateSnafu); - - // This is a unary accumulator, so only one column is provided. - let column = &values[0]; - let mut len = 1; - let column: &::VectorType = if column.is_const() { - len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } - } else { - unsafe { VectorHelper::static_cast(column) } - }; - (0..len).for_each(|_| { - for v in column.iter_data().flatten() { - self.push(v); - } - }); - Ok(()) - } - - // DataFusion executes accumulators in partitions. In some execution stage, DataFusion will - // merge states from other accumulators (returned by `state()` method). 
- fn merge_batch(&mut self, states: &[VectorRef]) -> Result<()> { - if states.is_empty() { - return Ok(()); - } - - // The states here are returned by the `state` method. Since we only returned a vector - // with one value in that method, `states[0]` is fine. - let states = &states[0]; - let states = states - .as_any() - .downcast_ref::() - .with_context(|| DowncastVectorSnafu { - err_msg: format!( - "expect ListVector, got vector type {}", - states.vector_type_name() - ), - })?; - for state in states.values_iter() { - let state = state.context(FromScalarValueSnafu)?; - // merging state is simply accumulate stored numbers from others', so just call update - self.update_batch(&[state])? - } - Ok(()) - } - - // DataFusion expects this function to return the final value of this aggregator. - fn evaluate(&self) -> Result { - if self.not_greater.is_empty() { - assert!( - self.greater.is_empty(), - "not expected in two-heap median algorithm, there must be a bug when implementing it" - ); - return Ok(Value::Null); - } - - // unwrap is safe because we checked not_greater heap's len above - let not_greater = *self.not_greater.peek().unwrap(); - let median = if self.not_greater.len() > self.greater.len() { - not_greater.into() - } else { - // unwrap is safe because greater heap len >= not_greater heap len, which is > 0 - let greater = self.greater.peek().unwrap(); - - // the following three NumCast's `unwrap`s are safe because T is primitive - let not_greater_v: f64 = NumCast::from(not_greater.as_primitive()).unwrap(); - let greater_v: f64 = NumCast::from(greater.0.as_primitive()).unwrap(); - let median: T = NumCast::from((not_greater_v + greater_v) / 2.0).unwrap(); - median.into() - }; - Ok(median) - } -} - -#[as_aggr_func_creator] -#[derive(Debug, Default, AggrFuncTypeStore)] -pub struct MedianAccumulatorCreator {} - -impl AggregateFunctionCreator for MedianAccumulatorCreator { - fn creator(&self) -> AccumulatorCreatorFunction { - let creator: AccumulatorCreatorFunction = Arc::new(move |types: &[ConcreteDataType]| { - let input_type = &types[0]; - with_match_primitive_type_id!( - input_type.logical_type_id(), - |$S| { - Ok(Box::new(Median::<$S>::default())) - }, - { - let err_msg = format!( - "\"MEDIAN\" aggregate function not support data type {:?}", - input_type.logical_type_id(), - ); - CreateAccumulatorSnafu { err_msg }.fail()? 
- } - ) - }); - creator - } - - fn output_type(&self) -> Result { - let input_types = self.input_types()?; - ensure!(input_types.len() == 1, InvalidInputStateSnafu); - // unwrap is safe because we have checked input_types len must equals 1 - Ok(input_types.into_iter().next().unwrap()) - } - - fn state_types(&self) -> Result> { - Ok(vec![ConcreteDataType::list_datatype(self.output_type()?)]) - } -} - -#[cfg(test)] -mod test { - use datatypes::vectors::PrimitiveVector; - - use super::*; - #[test] - fn test_update_batch() { - // test update empty batch, expect not updating anything - let mut median = Median::::default(); - assert!(median.update_batch(&[]).is_ok()); - assert!(median.not_greater.is_empty()); - assert!(median.greater.is_empty()); - assert_eq!(Value::Null, median.evaluate().unwrap()); - - // test update one not-null value - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![Some(42)]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(42), median.evaluate().unwrap()); - - // test update one null value - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Option::::None, - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Null, median.evaluate().unwrap()); - - // test update no null-value batch - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(1), median.evaluate().unwrap()); - - // test update null-value batch - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ]))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(3), median.evaluate().unwrap()); - - // test update with constant vector - let mut median = Median::::default(); - let v: Vec = vec![Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), - 10, - ))]; - assert!(median.update_batch(&v).is_ok()); - assert_eq!(Value::Int32(4), median.evaluate().unwrap()); - } -} diff --git a/src/common/function/src/scalars/aggregate/percentile.rs b/src/common/function/src/scalars/aggregate/percentile.rs index 1b642dd274..1517f90e62 100644 --- a/src/common/function/src/scalars/aggregate/percentile.rs +++ b/src/common/function/src/scalars/aggregate/percentile.rs @@ -26,7 +26,7 @@ use common_query::prelude::*; use datatypes::prelude::*; use datatypes::types::OrdPrimitive; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num::NumCast; use snafu::{ensure, OptionExt, ResultExt}; @@ -44,15 +44,15 @@ use snafu::{ensure, OptionExt, ResultExt}; // This optional method parameter specifies the method to use when the desired quantile lies between two data points i < j. // If g is the fractional part of the index surrounded by i and alpha and beta are correction constants modifying i and j. // i+g = (q-alpha)/(n-alpha-beta+1) -// Below, ‘q’ is the quantile value, ‘n’ is the sample size and alpha and beta are constants. The following formula gives an interpolation “i + g” of where the quantile would be in the sorted sample. -// With ‘i’ being the floor and ‘g’ the fractional part of the result. 
+// Below, 'q' is the quantile value, 'n' is the sample size and alpha and beta are constants. The following formula gives an interpolation "i + g" of where the quantile would be in the sorted sample. +// With 'i' being the floor and 'g' the fractional part of the result. // the default method is linear where // alpha = 1 // beta = 1 #[derive(Debug, Default)] pub struct Percentile where - T: Primitive, + T: WrapperType, { greater: BinaryHeap>>, not_greater: BinaryHeap>, @@ -62,7 +62,7 @@ where impl Percentile where - T: Primitive, + T: WrapperType, { fn push(&mut self, value: T) { let value = OrdPrimitive::(value); @@ -93,8 +93,7 @@ where impl Accumulator for Percentile where - T: Primitive, - for<'a> T: Scalar = T>, + T: WrapperType, { fn state(&self) -> Result> { let nums = self @@ -107,7 +106,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.p.into(), ]) @@ -129,14 +128,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"POLYVAL\" function's second argument to be float64", })?; // `get(0)` is safe because we have checked `values[1].len() == values[0].len() != 0` @@ -209,10 +208,11 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) @@ -259,7 +259,7 @@ impl AggregateFunctionCreator for PercentileAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Percentile::<$S>::default())) + Ok(Box::new(Percentile::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -292,7 +292,7 @@ impl AggregateFunctionCreator for PercentileAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -307,8 +307,8 @@ mod test { // test update one not-null value let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Some(42)])), - Arc::new(PrimitiveVector::::from(vec![Some(100.0_f64)])), + Arc::new(Int32Vector::from(vec![Some(42)])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::from(42.0_f64), percentile.evaluate().unwrap()); @@ -316,8 +316,8 @@ mod test { // test update one null value let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Option::::None])), - Arc::new(PrimitiveVector::::from(vec![Some(100.0_f64)])), + Arc::new(Int32Vector::from(vec![Option::::None])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::Null, percentile.evaluate().unwrap()); @@ -325,12 +325,8 @@ mod test { // test update no null-value batch let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -342,13 +338,8 @@ mod test { // test update null-value batch let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -362,13 +353,10 @@ mod test { let mut percentile = Percentile::::default(); let v: Vec = vec![ Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 2, )), - Arc::new(PrimitiveVector::::from(vec![ - Some(100.0_f64), - Some(100.0_f64), - ])), + Arc::new(Float64Vector::from(vec![Some(100.0_f64), Some(100.0_f64)])), ]; assert!(percentile.update_batch(&v).is_ok()); assert_eq!(Value::from(4_f64), percentile.evaluate().unwrap()); @@ -376,12 +364,8 @@ mod test { // test left border let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(0.0_f64), Some(0.0_f64), Some(0.0_f64), @@ -393,12 +377,8 @@ mod test { // test medium let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - 
Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(50.0_f64), Some(50.0_f64), Some(50.0_f64), @@ -410,12 +390,8 @@ mod test { // test right border let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(100.0_f64), Some(100.0_f64), Some(100.0_f64), @@ -431,12 +407,8 @@ mod test { // >> 6.400000000000 let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(10i32), - Some(7), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(10i32), Some(7), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(40.0_f64), Some(40.0_f64), Some(40.0_f64), @@ -451,12 +423,8 @@ mod test { // >> 9.7000000000000011 let mut percentile = Percentile::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(10i32), - Some(7), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(10i32), Some(7), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(95.0_f64), Some(95.0_f64), Some(95.0_f64), diff --git a/src/common/function/src/scalars/aggregate/polyval.rs b/src/common/function/src/scalars/aggregate/polyval.rs index 75a9d809f7..0a8fc818c5 100644 --- a/src/common/function/src/scalars/aggregate/polyval.rs +++ b/src/common/function/src/scalars/aggregate/polyval.rs @@ -23,9 +23,9 @@ use common_query::error::{ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; -use datatypes::types::PrimitiveType; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; use datatypes::value::ListValue; -use datatypes::vectors::{ConstantVector, Int64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Helper, Int64Vector, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -34,8 +34,10 @@ use snafu::{ensure, OptionExt, ResultExt}; #[derive(Debug, Default)] pub struct Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType, + PolyT::Native: std::ops::Mul, { values: Vec, // DataFusion casts constant in into i64 type. 
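Polyval treats the first input column as polynomial coefficients from the highest power down and evaluates the polynomial at the constant second argument x, i.e. the sum over i of values[i] * x^(n-1-i). A plain-Rust sketch of what `evaluate` below computes (a hypothetical helper, shown only for illustration):

    // polyval(&[3, 0, 1], 5) == 3 * 5^2 + 0 * 5 + 1 == 76,
    // like numpy.polyval([3, 0, 1], 5).
    fn polyval(coeffs: &[i64], x: i64) -> i64 {
        let n = coeffs.len() as u32;
        coeffs
            .iter()
            .enumerate()
            .map(|(i, &c)| c * x.pow(n - 1 - i as u32))
            .sum()
    }

The constant-vector test below exercises the same rule: coefficients [4, 4] at x = 5 give 4 * 5 + 4 = 24.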
@@ -45,8 +47,10 @@ where impl Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType, + PolyT::Native: std::ops::Mul, { fn push(&mut self, value: T) { self.values.push(value); @@ -55,11 +59,11 @@ where impl Accumulator for Polyval where - T: Primitive + AsPrimitive, - PolyT: Primitive + std::ops::Mul + std::iter::Sum, - for<'a> T: Scalar = T>, - for<'a> PolyT: Scalar = PolyT>, - i64: AsPrimitive, + T: WrapperType, + T::Native: AsPrimitive, + PolyT: WrapperType + std::iter::Sum<::Native>, + PolyT::Native: std::ops::Mul + std::iter::Sum, + i64: AsPrimitive<::Native>, { fn state(&self) -> Result> { let nums = self @@ -70,7 +74,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -91,10 +95,10 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; (0..len).for_each(|_| { for v in column.iter_data().flatten() { @@ -103,7 +107,7 @@ where }); let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"POLYVAL\" function's second argument to be a positive integer", })?; // `get(0)` is safe because we have checked `values[1].len() == values[0].len() != 0` @@ -172,12 +176,14 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } + Ok(()) } @@ -196,7 +202,7 @@ where .values .iter() .enumerate() - .map(|(i, &value)| value.as_() * (x.pow((len - 1 - i) as u32)).as_()) + .map(|(i, &value)| value.into_native().as_() * x.pow((len - 1 - i) as u32).as_()) .sum(); Ok(polyval.into()) } @@ -213,7 +219,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(Polyval::<$S,<$S as Primitive>::LargestType>::default())) + Ok(Box::new(Polyval::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -234,7 +240,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { with_match_primitive_type_id!( input_type, |$S| { - Ok(PrimitiveType::<<$S as Primitive>::LargestType>::default().into()) + Ok(<<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::build_data_type()) }, { unreachable!() @@ -254,7 +260,7 @@ impl AggregateFunctionCreator for PolyvalAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::Int32Vector; use super::*; #[test] @@ -268,8 +274,8 @@ mod test { // test update one not-null value let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Some(3)])), - Arc::new(PrimitiveVector::::from(vec![Some(2_i64)])), + Arc::new(Int32Vector::from(vec![Some(3)])), + Arc::new(Int64Vector::from(vec![Some(2_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Int64(3), polyval.evaluate().unwrap()); @@ -277,8 +283,8 @@ mod test { // test update one null value let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![Option::::None])), - Arc::new(PrimitiveVector::::from(vec![Some(2_i64)])), + Arc::new(Int32Vector::from(vec![Option::::None])), + Arc::new(Int64Vector::from(vec![Some(2_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Null, polyval.evaluate().unwrap()); @@ -286,12 +292,8 @@ mod test { // test update no null-value batch let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(3), - Some(0), - Some(1), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(3), Some(0), Some(1)])), + Arc::new(Int64Vector::from(vec![ Some(2_i64), Some(2_i64), Some(2_i64), @@ -303,13 +305,8 @@ mod test { // test update null-value batch let mut polyval = Polyval::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(3), - Some(0), - None, - Some(1), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(3), Some(0), None, Some(1)])), + Arc::new(Int64Vector::from(vec![ Some(2_i64), Some(2_i64), Some(2_i64), @@ -323,10 +320,10 @@ mod test { let mut polyval = Polyval::::default(); let v: Vec = vec![ Arc::new(ConstantVector::new( - Arc::new(PrimitiveVector::::from_vec(vec![4])), + Arc::new(Int32Vector::from_vec(vec![4])), 2, )), - Arc::new(PrimitiveVector::::from(vec![Some(5_i64), Some(5_i64)])), + Arc::new(Int64Vector::from(vec![Some(5_i64), Some(5_i64)])), ]; assert!(polyval.update_batch(&v).is_ok()); assert_eq!(Value::Int64(24), polyval.evaluate().unwrap()); diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs 
b/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs index 8f43b64e92..caa07248a3 100644 --- a/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs +++ b/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs @@ -23,7 +23,7 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -33,18 +33,12 @@ use statrs::statistics::Statistics; // https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html #[derive(Debug, Default)] -pub struct ScipyStatsNormCdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +pub struct ScipyStatsNormCdf { values: Vec, x: Option, } -impl ScipyStatsNormCdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +impl ScipyStatsNormCdf { fn push(&mut self, value: T) { self.values.push(value); } @@ -52,8 +46,8 @@ where impl Accumulator for ScipyStatsNormCdf where - T: Primitive + AsPrimitive + std::iter::Sum, - for<'a> T: Scalar = T>, + T: WrapperType + std::iter::Sum, + T::Native: AsPrimitive, { fn state(&self) -> Result> { let nums = self @@ -64,7 +58,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -86,14 +80,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"SCIPYSTATSNORMCDF\" function's second argument to be a positive integer", })?; let first = x.get(0); @@ -160,19 +154,19 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? 
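Both ScipyStatsNormCdf and its Pdf sibling below fit a normal distribution to the accumulated values, so `evaluate` (next hunk) needs the sample mean and standard deviation for statrs' Normal::new; the rewrite feeds two fresh iterators to statrs instead of cloning an intermediate Vec. Assuming statrs' `Statistics::std_dev` is the unbiased (n - 1) estimator, the computation is equivalent to this std-only sketch:

    // Hypothetical stand-in for statrs' mean()/std_dev(); not this crate's API.
    fn mean_and_std_dev(values: &[f64]) -> (f64, f64) {
        let n = values.len() as f64;
        let mean = values.iter().sum::<f64>() / n;
        // Unbiased sample variance: divide by n - 1.
        let var = values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / (n - 1.0);
        (mean, var.sqrt())
    }

With fewer than two samples the standard deviation is NaN (and with zero samples the mean is too), which is exactly the case the `is_nan()` guard below maps to Value::Null.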
{ + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) } fn evaluate(&self) -> Result { - let values = self.values.iter().map(|&v| v.as_()).collect::>(); - let mean = values.clone().mean(); - let std_dev = values.std_dev(); + let mean = self.values.iter().map(|v| v.into_native().as_()).mean(); + let std_dev = self.values.iter().map(|v| v.into_native().as_()).std_dev(); if mean.is_nan() || std_dev.is_nan() { Ok(Value::Null) } else { @@ -198,7 +192,7 @@ impl AggregateFunctionCreator for ScipyStatsNormCdfAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(ScipyStatsNormCdf::<$S>::default())) + Ok(Box::new(ScipyStatsNormCdf::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -230,7 +224,7 @@ impl AggregateFunctionCreator for ScipyStatsNormCdfAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -244,12 +238,8 @@ mod test { // test update no null-value batch let mut scipy_stats_norm_cdf = ScipyStatsNormCdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), Some(2.0_f64), Some(2.0_f64), @@ -264,13 +254,8 @@ mod test { // test update null-value batch let mut scipy_stats_norm_cdf = ScipyStatsNormCdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), None, Some(2.0_f64), diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs b/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs index e381d11b54..186d59a890 100644 --- a/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs +++ b/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs @@ -23,7 +23,7 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use datatypes::prelude::*; use datatypes::value::{ListValue, OrderedFloat}; -use datatypes::vectors::{ConstantVector, Float64Vector, ListVector}; +use datatypes::vectors::{ConstantVector, Float64Vector, Helper, ListVector}; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use snafu::{ensure, OptionExt, ResultExt}; @@ -33,18 +33,12 @@ use statrs::statistics::Statistics; // https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html #[derive(Debug, Default)] -pub struct ScipyStatsNormPdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +pub struct ScipyStatsNormPdf { values: Vec, x: Option, } -impl ScipyStatsNormPdf -where - T: Primitive + AsPrimitive + std::iter::Sum, -{ +impl ScipyStatsNormPdf { fn push(&mut self, value: T) { self.values.push(value); } @@ -52,8 +46,8 @@ where impl Accumulator for ScipyStatsNormPdf where - T: Primitive + AsPrimitive + std::iter::Sum, - for<'a> T: Scalar = T>, + T: WrapperType, + T::Native: AsPrimitive + std::iter::Sum, { fn state(&self) -> Result> { let nums = self @@ -64,7 +58,7 @@ where Ok(vec![ Value::List(ListValue::new( Some(Box::new(nums)), - 
T::default().into().data_type(), + T::LogicalType::build_data_type(), )), self.x.into(), ]) @@ -86,14 +80,14 @@ where let mut len = 1; let column: &::VectorType = if column.is_const() { len = column.len(); - let column: &ConstantVector = unsafe { VectorHelper::static_cast(column) }; - unsafe { VectorHelper::static_cast(column.inner()) } + let column: &ConstantVector = unsafe { Helper::static_cast(column) }; + unsafe { Helper::static_cast(column.inner()) } } else { - unsafe { VectorHelper::static_cast(column) } + unsafe { Helper::static_cast(column) } }; let x = &values[1]; - let x = VectorHelper::check_get_scalar::(x).context(error::InvalidInputsSnafu { + let x = Helper::check_get_scalar::(x).context(error::InvalidInputTypeSnafu { err_msg: "expecting \"SCIPYSTATSNORMPDF\" function's second argument to be a positive integer", })?; let first = x.get(0); @@ -160,19 +154,20 @@ where ), })?; for value in values.values_iter() { - let value = value.context(FromScalarValueSnafu)?; - let column: &::VectorType = unsafe { VectorHelper::static_cast(&value) }; - for v in column.iter_data().flatten() { - self.push(v); + if let Some(value) = value.context(FromScalarValueSnafu)? { + let column: &::VectorType = unsafe { Helper::static_cast(&value) }; + for v in column.iter_data().flatten() { + self.push(v); + } } } Ok(()) } fn evaluate(&self) -> Result { - let values = self.values.iter().map(|&v| v.as_()).collect::>(); - let mean = values.clone().mean(); - let std_dev = values.std_dev(); + let mean = self.values.iter().map(|v| v.into_native().as_()).mean(); + let std_dev = self.values.iter().map(|v| v.into_native().as_()).std_dev(); + if mean.is_nan() || std_dev.is_nan() { Ok(Value::Null) } else { @@ -198,7 +193,7 @@ impl AggregateFunctionCreator for ScipyStatsNormPdfAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(ScipyStatsNormPdf::<$S>::default())) + Ok(Box::new(ScipyStatsNormPdf::<<$S as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -230,7 +225,7 @@ impl AggregateFunctionCreator for ScipyStatsNormPdfAccumulatorCreator { #[cfg(test)] mod test { - use datatypes::vectors::PrimitiveVector; + use datatypes::vectors::{Float64Vector, Int32Vector}; use super::*; #[test] @@ -244,12 +239,8 @@ mod test { // test update no null-value batch let mut scipy_stats_norm_pdf = ScipyStatsNormPdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-1i32), - Some(1), - Some(2), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-1i32), Some(1), Some(2)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), Some(2.0_f64), Some(2.0_f64), @@ -264,13 +255,8 @@ mod test { // test update null-value batch let mut scipy_stats_norm_pdf = ScipyStatsNormPdf::::default(); let v: Vec = vec![ - Arc::new(PrimitiveVector::::from(vec![ - Some(-2i32), - None, - Some(3), - Some(4), - ])), - Arc::new(PrimitiveVector::::from(vec![ + Arc::new(Int32Vector::from(vec![Some(-2i32), None, Some(3), Some(4)])), + Arc::new(Float64Vector::from(vec![ Some(2.0_f64), None, Some(2.0_f64), diff --git a/src/common/function/src/scalars/expression/mod.rs b/src/common/function/src/scalars/expression.rs similarity index 100% rename from src/common/function/src/scalars/expression/mod.rs rename to src/common/function/src/scalars/expression.rs diff --git a/src/common/function/src/scalars/expression/binary.rs b/src/common/function/src/scalars/expression/binary.rs index b02e46c937..d1a9db8eb9 100644 --- 
a/src/common/function/src/scalars/expression/binary.rs +++ b/src/common/function/src/scalars/expression/binary.rs @@ -14,10 +14,10 @@ use std::iter; +use common_query::error::Result; use datatypes::prelude::*; -use datatypes::vectors::ConstantVector; +use datatypes::vectors::{ConstantVector, Helper}; -use crate::error::Result; use crate::scalars::expression::ctx::EvalContext; pub fn scalar_binary_op( @@ -36,10 +36,9 @@ where let result = match (l.is_const(), r.is_const()) { (false, true) => { - let left: &::VectorType = unsafe { VectorHelper::static_cast(l) }; - let right: &ConstantVector = unsafe { VectorHelper::static_cast(r) }; - let right: &::VectorType = - unsafe { VectorHelper::static_cast(right.inner()) }; + let left: &::VectorType = unsafe { Helper::static_cast(l) }; + let right: &ConstantVector = unsafe { Helper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(right.inner()) }; let b = right.get_data(0); let it = left.iter_data().map(|a| f(a, b, ctx)); @@ -47,8 +46,8 @@ where } (false, false) => { - let left: &::VectorType = unsafe { VectorHelper::static_cast(l) }; - let right: &::VectorType = unsafe { VectorHelper::static_cast(r) }; + let left: &::VectorType = unsafe { Helper::static_cast(l) }; + let right: &::VectorType = unsafe { Helper::static_cast(r) }; let it = left .iter_data() @@ -58,25 +57,22 @@ where } (true, false) => { - let left: &ConstantVector = unsafe { VectorHelper::static_cast(l) }; - let left: &::VectorType = - unsafe { VectorHelper::static_cast(left.inner()) }; + let left: &ConstantVector = unsafe { Helper::static_cast(l) }; + let left: &::VectorType = unsafe { Helper::static_cast(left.inner()) }; let a = left.get_data(0); - let right: &::VectorType = unsafe { VectorHelper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(r) }; let it = right.iter_data().map(|b| f(a, b, ctx)); ::VectorType::from_owned_iterator(it) } (true, true) => { - let left: &ConstantVector = unsafe { VectorHelper::static_cast(l) }; - let left: &::VectorType = - unsafe { VectorHelper::static_cast(left.inner()) }; + let left: &ConstantVector = unsafe { Helper::static_cast(l) }; + let left: &::VectorType = unsafe { Helper::static_cast(left.inner()) }; let a = left.get_data(0); - let right: &ConstantVector = unsafe { VectorHelper::static_cast(r) }; - let right: &::VectorType = - unsafe { VectorHelper::static_cast(right.inner()) }; + let right: &ConstantVector = unsafe { Helper::static_cast(r) }; + let right: &::VectorType = unsafe { Helper::static_cast(right.inner()) }; let b = right.get_data(0); let it = iter::repeat(a) diff --git a/src/common/function/src/scalars/expression/ctx.rs b/src/common/function/src/scalars/expression/ctx.rs index 7910bb82b8..c6735bd1d0 100644 --- a/src/common/function/src/scalars/expression/ctx.rs +++ b/src/common/function/src/scalars/expression/ctx.rs @@ -13,8 +13,7 @@ // limitations under the License. use chrono_tz::Tz; - -use crate::error::Error; +use common_query::error::Error; pub struct EvalContext { _tz: Tz, diff --git a/src/common/function/src/scalars/expression/unary.rs b/src/common/function/src/scalars/expression/unary.rs index a3434a2b0e..0862f711e1 100644 --- a/src/common/function/src/scalars/expression/unary.rs +++ b/src/common/function/src/scalars/expression/unary.rs @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
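The four match arms in `scalar_binary_op` above exist only to handle broadcasting: a ConstantVector contributes a single value that is paired with every row of the other operand. Stripped of the unsafe vector casts, the rule reduces to the following sketch (not the crate's API, just the shape of the logic):

    /// Broadcast a length-1 ("constant") side across the other and apply
    /// `f` row-wise; otherwise zip the two sides positionally.
    fn binary_broadcast<L: Copy, R: Copy, O>(
        left: &[Option<L>],
        right: &[Option<R>],
        f: impl Fn(Option<L>, Option<R>) -> Option<O>,
    ) -> Vec<Option<O>> {
        match (left.len() == 1, right.len() == 1) {
            (false, true) => left.iter().map(|&a| f(a, right[0])).collect(),
            (true, false) => right.iter().map(|&b| f(left[0], b)).collect(),
            _ => left.iter().zip(right).map(|(&a, &b)| f(a, b)).collect(),
        }
    }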
+use common_query::error::{self, Result}; use datatypes::prelude::*; +use datatypes::vectors::Helper; use snafu::ResultExt; -use crate::error::{GetScalarVectorSnafu, Result}; use crate::scalars::expression::ctx::EvalContext; /// TODO: remove the allow_unused when it's used. @@ -28,7 +29,7 @@ pub fn scalar_unary_op( where F: Fn(Option>, &mut EvalContext) -> Option, { - let left = VectorHelper::check_get_scalar::(l).context(GetScalarVectorSnafu)?; + let left = Helper::check_get_scalar::(l).context(error::GetScalarVectorSnafu)?; let it = left.iter_data().map(|a| f(a, ctx)); let result = ::VectorType::from_owned_iterator(it); diff --git a/src/common/function/src/scalars/function.rs b/src/common/function/src/scalars/function.rs index 353f524ea9..6f70bca4a0 100644 --- a/src/common/function/src/scalars/function.rs +++ b/src/common/function/src/scalars/function.rs @@ -16,12 +16,11 @@ use std::fmt; use std::sync::Arc; use chrono_tz::Tz; +use common_query::error::Result; use common_query::prelude::Signature; use datatypes::data_type::ConcreteDataType; use datatypes::vectors::VectorRef; -use crate::error::Result; - #[derive(Clone)] pub struct FunctionContext { pub tz: Tz, diff --git a/src/common/function/src/scalars/math/mod.rs b/src/common/function/src/scalars/math.rs similarity index 100% rename from src/common/function/src/scalars/math/mod.rs rename to src/common/function/src/scalars/math.rs diff --git a/src/common/function/src/scalars/math/pow.rs b/src/common/function/src/scalars/math/pow.rs index fcbb877240..6a4e1937dd 100644 --- a/src/common/function/src/scalars/math/pow.rs +++ b/src/common/function/src/scalars/math/pow.rs @@ -15,15 +15,16 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; +use datatypes::types::LogicalPrimitiveType; use datatypes::vectors::VectorRef; use datatypes::with_match_primitive_type_id; use num::traits::Pow; use num_traits::AsPrimitive; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; @@ -46,7 +47,7 @@ impl Function for PowFunction { fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| { with_match_primitive_type_id!(columns[1].data_type().logical_type_id(), |$T| { - let col = scalar_binary_op::<$S, $T, f64, _>(&columns[0], &columns[1], scalar_pow, &mut EvalContext::default())?; + let col = scalar_binary_op::<<$S as LogicalPrimitiveType>::Native, <$T as LogicalPrimitiveType>::Native, f64, _>(&columns[0], &columns[1], scalar_pow, &mut EvalContext::default())?; Ok(Arc::new(col)) },{ unreachable!() diff --git a/src/common/function/src/scalars/math/rate.rs b/src/common/function/src/scalars/math/rate.rs index 628a19408a..ad03485a36 100644 --- a/src/common/function/src/scalars/math/rate.rs +++ b/src/common/function/src/scalars/math/rate.rs @@ -14,10 +14,10 @@ use std::fmt; -use arrow::array::Array; -use common_query::error::{FromArrowArraySnafu, Result, TypeCastSnafu}; +use common_query::error::{self, Result}; use common_query::prelude::{Signature, Volatility}; -use datatypes::arrow; +use datatypes::arrow::compute::kernels::{arithmetic, cast}; +use datatypes::arrow::datatypes::DataType; use datatypes::prelude::*; use datatypes::vectors::{Helper, VectorRef}; use snafu::ResultExt; @@ -51,28 +51,21 @@ impl Function for 
RateFunction { let val = &columns[0].to_arrow_array(); let val_0 = val.slice(0, val.len() - 1); let val_1 = val.slice(1, val.len() - 1); - let dv = arrow::compute::arithmetics::sub(&*val_1, &*val_0); + let dv = arithmetic::subtract_dyn(&val_1, &val_0).context(error::ArrowComputeSnafu)?; let ts = &columns[1].to_arrow_array(); let ts_0 = ts.slice(0, ts.len() - 1); let ts_1 = ts.slice(1, ts.len() - 1); - let dt = arrow::compute::arithmetics::sub(&*ts_1, &*ts_0); - fn all_to_f64(array: &dyn Array) -> Result> { - Ok(arrow::compute::cast::cast( - array, - &arrow::datatypes::DataType::Float64, - arrow::compute::cast::CastOptions { - wrapped: true, - partial: true, - }, - ) - .context(TypeCastSnafu { - typ: arrow::datatypes::DataType::Float64, - })?) - } - let dv = all_to_f64(&*dv)?; - let dt = all_to_f64(&*dt)?; - let rate = arrow::compute::arithmetics::div(&*dv, &*dt); - let v = Helper::try_into_vector(&rate).context(FromArrowArraySnafu)?; + let dt = arithmetic::subtract_dyn(&ts_1, &ts_0).context(error::ArrowComputeSnafu)?; + + let dv = cast::cast(&dv, &DataType::Float64).context(error::TypeCastSnafu { + typ: DataType::Float64, + })?; + let dt = cast::cast(&dt, &DataType::Float64).context(error::TypeCastSnafu { + typ: DataType::Float64, + })?; + let rate = arithmetic::divide_dyn(&dv, &dt).context(error::ArrowComputeSnafu)?; + let v = Helper::try_into_vector(&rate).context(error::FromArrowArraySnafu)?; + Ok(v) } } @@ -81,9 +74,8 @@ impl Function for RateFunction { mod tests { use std::sync::Arc; - use arrow::array::Float64Array; use common_query::prelude::TypeSignature; - use datatypes::vectors::{Float32Vector, Int64Vector}; + use datatypes::vectors::{Float32Vector, Float64Vector, Int64Vector}; use super::*; #[test] @@ -108,9 +100,7 @@ mod tests { Arc::new(Int64Vector::from_vec(ts)), ]; let vector = rate.eval(FunctionContext::default(), &args).unwrap(); - let arr = vector.to_arrow_array(); - let expect = Arc::new(Float64Array::from_vec(vec![2.0, 3.0])); - let res = arrow::compute::comparison::eq(&*arr, &*expect); - res.iter().for_each(|x| assert!(matches!(x, Some(true)))); + let expect: VectorRef = Arc::new(Float64Vector::from_vec(vec![2.0, 3.0])); + assert_eq!(expect, vector); } } diff --git a/src/common/function/src/scalars/numpy/mod.rs b/src/common/function/src/scalars/numpy.rs similarity index 98% rename from src/common/function/src/scalars/numpy/mod.rs rename to src/common/function/src/scalars/numpy.rs index 76140fb7de..ed8d9b6f30 100644 --- a/src/common/function/src/scalars/numpy/mod.rs +++ b/src/common/function/src/scalars/numpy.rs @@ -13,7 +13,6 @@ // limitations under the License. 
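RateFunction above is a first-difference quotient: it slices each column into its leading and trailing len - 1 elements, subtracts them with the arrow kernels, casts both deltas to Float64, and divides. A dependency-free sketch of the same arithmetic:

    /// rate[i] = (v[i+1] - v[i]) / (t[i+1] - t[i]); the output is one
    /// element shorter than the input, as with the slices above.
    fn rate(values: &[f64], timestamps: &[i64]) -> Vec<f64> {
        values
            .windows(2)
            .zip(timestamps.windows(2))
            .map(|(v, t)| (v[1] - v[0]) / (t[1] - t[0]) as f64)
            .collect()
    }

For example, rate(&[1.0, 3.0, 6.0], &[0, 1, 2]) returns [2.0, 3.0], the same shape the test above asserts.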
mod clip; -#[allow(unused)] mod interp; use std::sync::Arc; diff --git a/src/common/function/src/scalars/numpy/clip.rs b/src/common/function/src/scalars/numpy/clip.rs index f743bf5ff5..888a080f3f 100644 --- a/src/common/function/src/scalars/numpy/clip.rs +++ b/src/common/function/src/scalars/numpy/clip.rs @@ -15,14 +15,15 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; -use datatypes::data_type::{ConcreteDataType, DataType}; -use datatypes::prelude::{Scalar, VectorRef}; -use datatypes::with_match_primitive_type_id; -use num_traits::AsPrimitive; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::ArrowPrimitiveType; +use datatypes::data_type::ConcreteDataType; +use datatypes::prelude::*; +use datatypes::vectors::PrimitiveVector; use paste::paste; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; @@ -34,25 +35,32 @@ macro_rules! define_eval { ($O: ident) => { paste! { fn [](columns: &[VectorRef]) -> Result { - with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| { - with_match_primitive_type_id!(columns[1].data_type().logical_type_id(), |$T| { - with_match_primitive_type_id!(columns[2].data_type().logical_type_id(), |$R| { - // clip(a, min, max) is equals to min(max(a, min), max) - let col: VectorRef = Arc::new(scalar_binary_op::<$S, $T, $O, _>(&columns[0], &columns[1], scalar_max, &mut EvalContext::default())?); - let col = scalar_binary_op::<$O, $R, $O, _>(&col, &columns[2], scalar_min, &mut EvalContext::default())?; - Ok(Arc::new(col)) - }, { - unreachable!() - }) - }, { - unreachable!() - }) - }, { - unreachable!() - }) + fn cast_vector(input: &VectorRef) -> VectorRef { + Arc::new(PrimitiveVector::<<$O as WrapperType>::LogicalType>::try_from_arrow_array( + compute::cast(&input.to_arrow_array(), &<<<$O as WrapperType>::LogicalType as LogicalPrimitiveType>::ArrowPrimitive as ArrowPrimitiveType>::DATA_TYPE).unwrap() + ).unwrap()) as _ + } + let operator_1 = cast_vector(&columns[0]); + let operator_2 = cast_vector(&columns[1]); + let operator_3 = cast_vector(&columns[2]); + + // clip(a, min, max) is equals to min(max(a, min), max) + let col: VectorRef = Arc::new(scalar_binary_op::<$O, $O, $O, _>( + &operator_1, + &operator_2, + scalar_max, + &mut EvalContext::default(), + )?); + let col = scalar_binary_op::<$O, $O, $O, _>( + &col, + &operator_3, + scalar_min, + &mut EvalContext::default(), + )?; + Ok(Arc::new(col)) } } - } + }; } define_eval!(i64); @@ -108,27 +116,23 @@ pub fn max(input: T, max: T) -> T { } #[inline] -fn scalar_min(left: Option, right: Option, _ctx: &mut EvalContext) -> Option +fn scalar_min(left: Option, right: Option, _ctx: &mut EvalContext) -> Option where - S: AsPrimitive, - T: AsPrimitive, O: Scalar + Copy + PartialOrd, { match (left, right) { - (Some(left), Some(right)) => Some(min(left.as_(), right.as_())), + (Some(left), Some(right)) => Some(min(left, right)), _ => None, } } #[inline] -fn scalar_max(left: Option, right: Option, _ctx: &mut EvalContext) -> Option +fn scalar_max(left: Option, right: Option, _ctx: &mut EvalContext) -> Option where - S: AsPrimitive, - T: AsPrimitive, O: Scalar + Copy + PartialOrd, { match (left, right) { - (Some(left), Some(right)) => Some(max(left.as_(), right.as_())), + (Some(left), Some(right)) => Some(max(left, right)), _ => None, } } @@ -143,11 +147,15 @@ impl fmt::Display for ClipFunction { mod tests { use 
common_query::prelude::TypeSignature; use datatypes::value::Value; - use datatypes::vectors::{ConstantVector, Float32Vector, Int32Vector, UInt32Vector}; + use datatypes::vectors::{ + ConstantVector, Float32Vector, Int16Vector, Int32Vector, Int8Vector, UInt16Vector, + UInt32Vector, UInt8Vector, + }; use super::*; + #[test] - fn test_clip_function() { + fn test_clip_signature() { let clip = ClipFunction::default(); assert_eq!("clip", clip.name()); @@ -190,16 +198,21 @@ mod tests { volatility: Volatility::Immutable } if valid_types == ConcreteDataType::numerics() )); + } + + #[test] + fn test_clip_fn_signed() { + let clip = ClipFunction::default(); // eval with signed integers let args: Vec = vec![ Arc::new(Int32Vector::from_values(0..10)), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![3])), + Arc::new(Int8Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![6])), + Arc::new(Int16Vector::from_vec(vec![6])), 10, )), ]; @@ -217,16 +230,21 @@ mod tests { assert!(matches!(vector.get(i), Value::Int64(v) if v == 6)); } } + } + + #[test] + fn test_clip_fn_unsigned() { + let clip = ClipFunction::default(); // eval with unsigned integers let args: Vec = vec![ - Arc::new(UInt32Vector::from_values(0..10)), + Arc::new(UInt8Vector::from_values(0..10)), Arc::new(ConstantVector::new( Arc::new(UInt32Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( - Arc::new(UInt32Vector::from_vec(vec![6])), + Arc::new(UInt16Vector::from_vec(vec![6])), 10, )), ]; @@ -244,12 +262,17 @@ mod tests { assert!(matches!(vector.get(i), Value::UInt64(v) if v == 6)); } } + } + + #[test] + fn test_clip_fn_float() { + let clip = ClipFunction::default(); // eval with floats let args: Vec = vec![ - Arc::new(Int32Vector::from_values(0..10)), + Arc::new(Int8Vector::from_values(0..10)), Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_vec(vec![3])), + Arc::new(UInt32Vector::from_vec(vec![3])), 10, )), Arc::new(ConstantVector::new( diff --git a/src/common/function/src/scalars/numpy/interp.rs b/src/common/function/src/scalars/numpy/interp.rs index 68981c2556..c4bb6e9811 100644 --- a/src/common/function/src/scalars/numpy/interp.rs +++ b/src/common/function/src/scalars/numpy/interp.rs @@ -14,41 +14,18 @@ use std::sync::Arc; -use datatypes::arrow::array::PrimitiveArray; -use datatypes::arrow::compute::cast::primitive_to_primitive; -use datatypes::arrow::datatypes::DataType::Float64; +use common_query::error::{self, Result}; +use datatypes::arrow::compute::cast; +use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::data_type::DataType; use datatypes::prelude::ScalarVector; -use datatypes::type_id::LogicalTypeId; use datatypes::value::Value; -use datatypes::vectors::{Float64Vector, PrimitiveVector, Vector, VectorRef}; -use datatypes::{arrow, with_match_primitive_type_id}; -use snafu::{ensure, Snafu}; - -#[derive(Debug, Snafu)] -pub enum Error { - #[snafu(display( - "The length of the args is not enough, expect at least: {}, have: {}", - expect, - actual, - ))] - ArgsLenNotEnough { expect: usize, actual: usize }, - - #[snafu(display("The sample {} is empty", name))] - SampleEmpty { name: String }, - - #[snafu(display( - "The length of the len1: {} don't match the length of the len2: {}", - len1, - len2, - ))] - LenNotEquals { len1: usize, len2: usize }, -} - -pub type Result = std::result::Result; +use datatypes::vectors::{Float64Vector, Vector, VectorRef}; +use datatypes::with_match_primitive_type_id; +use snafu::{ensure, 
ResultExt}; /* search the biggest number that smaller than x in xp */ -fn linear_search_ascending_vector(x: Value, xp: &PrimitiveVector) -> usize { +fn linear_search_ascending_vector(x: Value, xp: &Float64Vector) -> usize { for i in 0..xp.len() { if x < xp.get(i) { return i - 1; @@ -58,7 +35,7 @@ fn linear_search_ascending_vector(x: Value, xp: &PrimitiveVector) -> usize } /* search the biggest number that smaller than x in xp */ -fn binary_search_ascending_vector(key: Value, xp: &PrimitiveVector) -> usize { +fn binary_search_ascending_vector(key: Value, xp: &Float64Vector) -> usize { let mut left = 0; let mut right = xp.len(); /* If len <= 4 use linear search. */ @@ -77,27 +54,33 @@ fn binary_search_ascending_vector(key: Value, xp: &PrimitiveVector) -> usiz left - 1 } -fn concrete_type_to_primitive_vector(arg: &VectorRef) -> Result> { +fn concrete_type_to_primitive_vector(arg: &VectorRef) -> Result { with_match_primitive_type_id!(arg.data_type().logical_type_id(), |$S| { let tmp = arg.to_arrow_array(); - let from = tmp.as_any().downcast_ref::>().expect("cast failed"); - let array = primitive_to_primitive(from, &Float64); - Ok(PrimitiveVector::new(array)) + let array = cast(&tmp, &ArrowDataType::Float64).context(error::TypeCastSnafu { + typ: ArrowDataType::Float64, + })?; + // Safety: array has been cast to Float64Array. + Ok(Float64Vector::try_from_arrow_array(array).unwrap()) },{ unreachable!() }) } /// https://github.com/numpy/numpy/blob/b101756ac02e390d605b2febcded30a1da50cc2c/numpy/core/src/multiarray/compiled_base.c#L491 +#[allow(unused)] pub fn interp(args: &[VectorRef]) -> Result { let mut left = None; let mut right = None; ensure!( args.len() >= 3, - ArgsLenNotEnoughSnafu { - expect: 3_usize, - actual: args.len() + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not enough, expect at least: {}, have: {}", + 3, + args.len() + ), } ); @@ -109,9 +92,12 @@ pub fn interp(args: &[VectorRef]) -> Result { if args.len() > 3 { ensure!( args.len() == 5, - ArgsLenNotEnoughSnafu { - expect: 5_usize, - actual: args.len() + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not enough, expect at least: {}, have: {}", + 5, + args.len() + ), } ); @@ -123,14 +109,32 @@ pub fn interp(args: &[VectorRef]) -> Result { .get_data(0); } - ensure!(x.len() != 0, SampleEmptySnafu { name: "x" }); - ensure!(xp.len() != 0, SampleEmptySnafu { name: "xp" }); - ensure!(fp.len() != 0, SampleEmptySnafu { name: "fp" }); + ensure!( + x.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample x is empty", + } + ); + ensure!( + xp.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample xp is empty", + } + ); + ensure!( + fp.len() != 0, + error::InvalidFuncArgsSnafu { + err_msg: "The sample fp is empty", + } + ); ensure!( xp.len() == fp.len(), - LenNotEqualsSnafu { - len1: xp.len(), - len2: fp.len(), + error::InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the len1: {} don't match the length of the len2: {}", + xp.len(), + fp.len() + ), } ); @@ -147,7 +151,7 @@ pub fn interp(args: &[VectorRef]) -> Result { let res; if xp.len() == 1 { - res = x + let datas = x .iter_data() .map(|x| { if Value::from(x) < xp.get(0) { @@ -158,7 +162,8 @@ pub fn interp(args: &[VectorRef]) -> Result { fp.get_data(0) } }) - .collect::(); + .collect::>(); + res = Float64Vector::from(datas); } else { let mut j = 0; /* only pre-calculate slopes if there are relatively few of them. 
*/ @@ -185,7 +190,7 @@ pub fn interp(args: &[VectorRef]) -> Result { } slopes = Some(slopes_tmp); } - res = x + let datas = x .iter_data() .map(|x| match x { Some(xi) => { @@ -248,7 +253,8 @@ pub fn interp(args: &[VectorRef]) -> Result { } _ => None, }) - .collect::(); + .collect::>(); + res = Float64Vector::from(datas); } Ok(Arc::new(res) as _) } @@ -257,8 +263,7 @@ pub fn interp(args: &[VectorRef]) -> Result { mod tests { use std::sync::Arc; - use datatypes::prelude::ScalarVectorBuilder; - use datatypes::vectors::{Int32Vector, Int64Vector, PrimitiveVectorBuilder}; + use datatypes::vectors::{Int32Vector, Int64Vector}; use super::*; #[test] @@ -341,12 +346,8 @@ mod tests { assert!(matches!(vector.get(0), Value::Float64(v) if v==x[0] as f64)); // x=None output:Null - let input = [None, Some(0.0), Some(0.3)]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); - for v in input { - builder.push(v); - } - let x = builder.finish(); + let input = vec![None, Some(0.0), Some(0.3)]; + let x = Float64Vector::from(input); let args: Vec = vec![ Arc::new(x), Arc::new(Int64Vector::from_vec(xp)), diff --git a/src/common/function/src/scalars/test.rs b/src/common/function/src/scalars/test.rs index 7d74ff5d83..8e81d1f025 100644 --- a/src/common/function/src/scalars/test.rs +++ b/src/common/function/src/scalars/test.rs @@ -15,11 +15,11 @@ use std::fmt; use std::sync::Arc; +use common_query::error::Result; use common_query::prelude::{Signature, Volatility}; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::VectorRef; -use crate::error::Result; use crate::scalars::expression::{scalar_binary_op, EvalContext}; use crate::scalars::function::{Function, FunctionContext}; diff --git a/src/common/function/src/scalars/timestamp/mod.rs b/src/common/function/src/scalars/timestamp.rs similarity index 100% rename from src/common/function/src/scalars/timestamp/mod.rs rename to src/common/function/src/scalars/timestamp.rs diff --git a/src/common/function/src/scalars/timestamp/from_unixtime.rs b/src/common/function/src/scalars/timestamp/from_unixtime.rs index 4462672c8c..c8adc01f8c 100644 --- a/src/common/function/src/scalars/timestamp/from_unixtime.rs +++ b/src/common/function/src/scalars/timestamp/from_unixtime.rs @@ -17,16 +17,17 @@ use std::fmt; use std::sync::Arc; -use common_query::error::{IntoVectorSnafu, UnsupportedInputDataTypeSnafu}; +use common_query::error::{ + ArrowComputeSnafu, IntoVectorSnafu, Result, TypeCastSnafu, UnsupportedInputDataTypeSnafu, +}; use common_query::prelude::{Signature, Volatility}; -use datatypes::arrow::compute::arithmetics; -use datatypes::arrow::datatypes::DataType as ArrowDatatype; -use datatypes::arrow::scalar::PrimitiveScalar; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDatatype, Int64Type}; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; -use datatypes::vectors::{TimestampVector, VectorRef}; +use datatypes::vectors::{TimestampMillisecondVector, VectorRef}; use snafu::ResultExt; -use crate::error::Result; use crate::scalars::function::{Function, FunctionContext}; #[derive(Clone, Debug, Default)] @@ -40,7 +41,7 @@ impl Function for FromUnixtimeFunction { } fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { - Ok(ConcreteDataType::timestamp_millis_datatype()) + Ok(ConcreteDataType::timestamp_millisecond_datatype()) } fn signature(&self) -> Signature { @@ -56,14 +57,18 @@ impl Function for FromUnixtimeFunction { ConcreteDataType::Int64(_) => { let array = 
columns[0].to_arrow_array(); // Our timestamp vector's time unit is millisecond - let array = arithmetics::mul_scalar( - &*array, - &PrimitiveScalar::new(ArrowDatatype::Int64, Some(1000i64)), - ); + let array = compute::multiply_scalar_dyn::(&array, 1000i64) + .context(ArrowComputeSnafu)?; + let arrow_datatype = &self.return_type(&[]).unwrap().as_arrow_type(); Ok(Arc::new( - TimestampVector::try_from_arrow_array(array).context(IntoVectorSnafu { - data_type: ArrowDatatype::Int64, + TimestampMillisecondVector::try_from_arrow_array( + compute::cast(&array, arrow_datatype).context(TypeCastSnafu { + typ: ArrowDatatype::Int64, + })?, + ) + .context(IntoVectorSnafu { + data_type: arrow_datatype.clone(), })?, )) } @@ -71,8 +76,7 @@ impl Function for FromUnixtimeFunction { function: NAME, datatypes: columns.iter().map(|c| c.data_type()).collect::>(), } - .fail() - .map_err(|e| e.into()), + .fail(), } } } @@ -96,7 +100,7 @@ mod tests { let f = FromUnixtimeFunction::default(); assert_eq!("from_unixtime", f.name()); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), f.return_type(&[]).unwrap() ); diff --git a/src/common/function/src/scalars/udf.rs b/src/common/function/src/scalars/udf.rs index b2d47af34d..f6a7dcee87 100644 --- a/src/common/function/src/scalars/udf.rs +++ b/src/common/function/src/scalars/udf.rs @@ -19,7 +19,8 @@ use common_query::prelude::{ ColumnarValue, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUdf, ScalarValue, }; use datatypes::error::Error as DataTypeError; -use datatypes::prelude::{ConcreteDataType, VectorHelper}; +use datatypes::prelude::*; +use datatypes::vectors::Helper; use snafu::ResultExt; use crate::scalars::function::{FunctionContext, FunctionRef}; @@ -47,7 +48,7 @@ pub fn create_udf(func: FunctionRef) -> ScalarUdf { let args: Result, DataTypeError> = args .iter() .map(|arg| match arg { - ColumnarValue::Scalar(v) => VectorHelper::try_from_scalar_value(v.clone(), rows), + ColumnarValue::Scalar(v) => Helper::try_from_scalar_value(v.clone(), rows), ColumnarValue::Vector(v) => Ok(v.clone()), }) .collect(); diff --git a/src/common/grpc-expr/src/insert.rs b/src/common/grpc-expr/src/insert.rs index d7687d0789..f968ff9b56 100644 --- a/src/common/grpc-expr/src/insert.rs +++ b/src/common/grpc-expr/src/insert.rs @@ -22,11 +22,11 @@ use api::v1::{AddColumn, AddColumns, Column, ColumnDataType, ColumnDef, CreateEx use common_base::BitVec; use common_time::timestamp::Timestamp; use common_time::{Date, DateTime}; -use datatypes::data_type::ConcreteDataType; +use datatypes::data_type::{ConcreteDataType, DataType}; use datatypes::prelude::{ValueRef, VectorRef}; use datatypes::schema::SchemaRef; use datatypes::value::Value; -use datatypes::vectors::VectorBuilder; +use datatypes::vectors::MutableVector; use snafu::{ensure, OptionExt, ResultExt}; use table::metadata::TableId; use table::requests::{AddColumnRequest, AlterKind, AlterTableRequest, InsertRequest}; @@ -99,7 +99,7 @@ pub fn column_to_vector(column: &Column, rows: u32) -> Result { let column_datatype = wrapper.datatype(); let rows = rows as usize; - let mut vector = VectorBuilder::with_capacity(wrapper.into(), rows); + let mut vector = ConcreteDataType::from(wrapper).create_mutable_vector(rows); if let Some(values) = &column.values { let values = collect_column_values(column_datatype, values); @@ -110,21 +110,31 @@ pub fn column_to_vector(column: &Column, rows: u32) -> Result { for i in 0..rows { if let Some(true) = nulls_iter.next() { - 
vector.push_null(); + vector + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu)?; } else { - let value_ref = values_iter.next().context(InvalidColumnProtoSnafu { - err_msg: format!( - "value not found at position {} of column {}", - i, &column.column_name - ), - })?; - vector.try_push_ref(value_ref).context(CreateVectorSnafu)?; + let value_ref = values_iter + .next() + .with_context(|| InvalidColumnProtoSnafu { + err_msg: format!( + "value not found at position {} of column {}", + i, &column.column_name + ), + })?; + vector + .push_value_ref(value_ref) + .context(CreateVectorSnafu)?; } } } else { - (0..rows).for_each(|_| vector.push_null()); + (0..rows).try_for_each(|_| { + vector + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu) + })?; } - Ok(vector.finish()) + Ok(vector.to_vector()) } fn collect_column_values(column_datatype: ColumnDataType, values: &Values) -> Vec { @@ -174,9 +184,24 @@ fn collect_column_values(column_datatype: ColumnDataType, values: &Values) -> Ve DateTime::new(*v) )) } - ColumnDataType::Timestamp => { - collect_values!(values.ts_millis_values, |v| ValueRef::Timestamp( - Timestamp::from_millis(*v) + ColumnDataType::TimestampSecond => { + collect_values!(values.ts_second_values, |v| ValueRef::Timestamp( + Timestamp::new_second(*v) + )) + } + ColumnDataType::TimestampMillisecond => { + collect_values!(values.ts_millisecond_values, |v| ValueRef::Timestamp( + Timestamp::new_millisecond(*v) + )) + } + ColumnDataType::TimestampMicrosecond => { + collect_values!(values.ts_microsecond_values, |v| ValueRef::Timestamp( + Timestamp::new_microsecond(*v) + )) + } + ColumnDataType::TimestampNanosecond => { + collect_values!(values.ts_nanosecond_values, |v| ValueRef::Timestamp( + Timestamp::new_nanosecond(*v) + )) + } } @@ -289,10 +314,7 @@ pub fn insertion_expr_to_request( }, )?; let data_type = &column_schema.data_type; - entry.insert(VectorBuilder::with_capacity( - data_type.clone(), - row_count as usize, - )) + entry.insert(data_type.create_mutable_vector(row_count as usize)) } }; add_values_to_builder(vector_builder, values, row_count as usize, null_mask)?; @@ -300,7 +322,7 @@ pub fn insertion_expr_to_request( } let columns_values = columns_builders .into_iter() - .map(|(column_name, mut vector_builder)| (column_name, vector_builder.finish())) + .map(|(column_name, mut vector_builder)| (column_name, vector_builder.to_vector())) .collect(); Ok(InsertRequest { @@ -312,7 +334,7 @@ pub fn insertion_expr_to_request( } fn add_values_to_builder( - builder: &mut VectorBuilder, + builder: &mut Box, values: Values, row_count: usize, null_mask: Vec, @@ -323,9 +345,11 @@ fn add_values_to_builder( if null_mask.is_empty() { ensure!(values.len() == row_count, IllegalInsertDataSnafu); - values.iter().for_each(|value| { - builder.push(value); - }); + values.iter().try_for_each(|value| { + builder + .push_value_ref(value.as_value_ref()) + .context(CreateVectorSnafu) + })?; } else { let null_mask = BitVec::from_vec(null_mask); ensure!( @@ -336,9 +360,13 @@ fn add_values_to_builder( let mut idx_of_values = 0; for idx in 0..row_count { match is_null(&null_mask, idx) { - Some(true) => builder.push(&Value::Null), + Some(true) => builder + .push_value_ref(ValueRef::Null) + .context(CreateVectorSnafu)?, _ => { - builder.push(&values[idx_of_values]); + builder + .push_value_ref(values[idx_of_values].as_value_ref()) + .context(CreateVectorSnafu)?; idx_of_values += 1 } } @@ -418,9 +446,9 @@ fn convert_values(data_type: &ConcreteDataType, values: Values) -> Vec { .map(|v|
Value::Date(v.into())) .collect(), ConcreteDataType::Timestamp(_) => values - .ts_millis_values + .ts_millisecond_values .into_iter() - .map(|v| Value::Timestamp(Timestamp::from_millis(v))) + .map(|v| Value::Timestamp(Timestamp::new_millisecond(v))) .collect(), ConcreteDataType::Null(_) => unreachable!(), ConcreteDataType::List(_) => unreachable!(), @@ -543,7 +571,7 @@ mod tests { ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ConcreteDataType::from( ColumnDataTypeWrapper::try_new( column_defs @@ -624,8 +652,8 @@ mod tests { assert_eq!(Value::Float64(0.1.into()), memory.get(1)); let ts = insert_req.columns_values.get("ts").unwrap(); - assert_eq!(Value::Timestamp(Timestamp::from_millis(100)), ts.get(0)); - assert_eq!(Value::Timestamp(Timestamp::from_millis(101)), ts.get(1)); + assert_eq!(Value::Timestamp(Timestamp::new_millisecond(100)), ts.get(0)); + assert_eq!(Value::Timestamp(Timestamp::new_millisecond(101)), ts.get(1)); } #[test] @@ -675,8 +703,12 @@ mod tests { ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), true) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + true, + ) + .with_time_index(true), ]; Arc::new( @@ -741,7 +773,7 @@ mod tests { }; let ts_vals = column::Values { - ts_millis_values: vec![100, 101], + ts_millisecond_values: vec![100, 101], ..Default::default() }; let ts_column = Column { @@ -749,7 +781,7 @@ mod tests { semantic_type: TIMESTAMP_SEMANTIC_TYPE, values: Some(ts_vals), null_mask: vec![0], - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, }; ( diff --git a/src/common/grpc/Cargo.toml b/src/common/grpc/Cargo.toml index f1a60addba..b1b5a25b6e 100644 --- a/src/common/grpc/Cargo.toml +++ b/src/common/grpc/Cargo.toml @@ -13,9 +13,7 @@ common-query = { path = "../query" } common-recordbatch = { path = "../recordbatch" } common-runtime = { path = "../runtime" } dashmap = "5.4" -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../../datatypes" } snafu = { version = "0.7", features = ["backtraces"] } tokio = { version = "1.0", features = ["full"] } diff --git a/src/common/grpc/src/select.rs b/src/common/grpc/src/select.rs index 516f697d3b..3a572ab137 100644 --- a/src/common/grpc/src/select.rs +++ b/src/common/grpc/src/select.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
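Both directions of the gRPC value encoding share one convention: `null_mask` carries one bit per row (a set bit means null) and the typed values arrays store only the non-null rows, densely. A sketch of the decode side that `column_to_vector` above implements with a MutableVector (illustrative, fixed to i64 for brevity):

    /// Expand dense values plus per-row null bits back into optional rows.
    fn expand_rows(values: &[i64], null_mask: &[bool]) -> Vec<Option<i64>> {
        let mut next = 0;
        null_mask
            .iter()
            .map(|&is_null| {
                if is_null {
                    None
                } else {
                    let v = values[next];
                    next += 1;
                    Some(v)
                }
            })
            .collect()
    }

The `null_mask` helper in select.rs below builds the reverse mapping, pushing the negated validity bit for every row and bit-packing the result.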
-use std::sync::Arc; - use api::helper::ColumnDataTypeWrapper; use api::result::{build_err_result, ObjectResultBuilder}; use api::v1::codec::SelectResult; @@ -24,9 +22,14 @@ use common_error::prelude::ErrorExt; use common_error::status_code::StatusCode; use common_query::Output; use common_recordbatch::{RecordBatches, SendableRecordBatchStream}; -use datatypes::arrow::array::{Array, BooleanArray, PrimitiveArray}; -use datatypes::arrow_array::{BinaryArray, StringArray}; use datatypes::schema::SchemaRef; +use datatypes::types::{TimestampType, WrapperType}; +use datatypes::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, Float32Vector, Float64Vector, + Int16Vector, Int32Vector, Int64Vector, Int8Vector, StringVector, TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, +}; use snafu::{OptionExt, ResultExt}; use crate::error::{self, ConversionSnafu, Result}; @@ -46,6 +49,7 @@ pub async fn to_object_result(output: std::result::Result Err(e) => build_err_result(&e), } } + async fn collect(stream: SendableRecordBatchStream) -> Result { let recordbatches = RecordBatches::try_collect(stream) .await @@ -78,10 +82,7 @@ fn try_convert(record_batches: RecordBatches) -> Result { let schema = record_batches.schema(); let record_batches = record_batches.take(); - let row_count: usize = record_batches - .iter() - .map(|r| r.df_recordbatch.num_rows()) - .sum(); + let row_count: usize = record_batches.iter().map(|r| r.num_rows()).sum(); let schemas = schema.column_schemas(); let mut columns = Vec::with_capacity(schemas.len()); @@ -89,9 +90,9 @@ fn try_convert(record_batches: RecordBatches) -> Result { for (idx, column_schema) in schemas.iter().enumerate() { let column_name = column_schema.name.clone(); - let arrays: Vec> = record_batches + let arrays: Vec<_> = record_batches .iter() - .map(|r| r.df_recordbatch.columns()[idx].clone()) + .map(|r| r.column(idx).clone()) .collect(); let column = Column { @@ -112,7 +113,7 @@ fn try_convert(record_batches: RecordBatches) -> Result { }) } -pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { +pub fn null_mask(arrays: &[VectorRef], row_count: usize) -> Vec { let null_count: usize = arrays.iter().map(|a| a.null_count()).sum(); if null_count == 0 { @@ -122,10 +123,12 @@ pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { let mut null_mask = BitVec::with_capacity(row_count); for array in arrays { let validity = array.validity(); - if let Some(v) = validity { - v.iter().for_each(|x| null_mask.push(!x)); - } else { + if validity.is_all_valid() { null_mask.extend_from_bitslice(&BitVec::repeat(false, array.len())); + } else { + for i in 0..array.len() { + null_mask.push(!validity.is_set(i)); + } } } null_mask.into_vec() @@ -133,7 +136,9 @@ pub fn null_mask(arrays: &Vec>, row_count: usize) -> Vec { macro_rules! convert_arrow_array_to_grpc_vals { ($data_type: expr, $arrays: ident, $(($Type: pat, $CastType: ty, $field: ident, $MapFunction: expr)), +) => {{ - use datatypes::arrow::datatypes::{DataType, TimeUnit}; + use datatypes::data_type::{ConcreteDataType}; + use datatypes::prelude::ScalarVector; + match $data_type { $( $Type => { @@ -143,52 +148,114 @@ macro_rules! 
convert_arrow_array_to_grpc_vals { from: format!("{:?}", $data_type), })?; vals.$field.extend(array - .iter() + .iter_data() .filter_map(|i| i.map($MapFunction)) .collect::>()); } return Ok(vals); }, )+ - _ => unimplemented!(), + ConcreteDataType::Null(_) | ConcreteDataType::List(_) => unreachable!("Should not send {:?} in gRPC", $data_type), } }}; } -pub fn values(arrays: &[Arc]) -> Result { +pub fn values(arrays: &[VectorRef]) -> Result { if arrays.is_empty() { return Ok(Values::default()); } let data_type = arrays[0].data_type(); convert_arrow_array_to_grpc_vals!( - data_type, arrays, - - (DataType::Boolean, BooleanArray, bool_values, |x| {x}), - - (DataType::Int8, PrimitiveArray, i8_values, |x| {*x as i32}), - (DataType::Int16, PrimitiveArray, i16_values, |x| {*x as i32}), - (DataType::Int32, PrimitiveArray, i32_values, |x| {*x}), - (DataType::Int64, PrimitiveArray, i64_values, |x| {*x}), - - (DataType::UInt8, PrimitiveArray, u8_values, |x| {*x as u32}), - (DataType::UInt16, PrimitiveArray, u16_values, |x| {*x as u32}), - (DataType::UInt32, PrimitiveArray, u32_values, |x| {*x}), - (DataType::UInt64, PrimitiveArray, u64_values, |x| {*x}), - - (DataType::Float32, PrimitiveArray, f32_values, |x| {*x}), - (DataType::Float64, PrimitiveArray, f64_values, |x| {*x}), - - (DataType::Binary, BinaryArray, binary_values, |x| {x.into()}), - (DataType::LargeBinary, BinaryArray, binary_values, |x| {x.into()}), - - (DataType::Utf8, StringArray, string_values, |x| {x.into()}), - (DataType::LargeUtf8, StringArray, string_values, |x| {x.into()}), - - (DataType::Date32, PrimitiveArray, date_values, |x| {*x as i32}), - (DataType::Date64, PrimitiveArray, datetime_values,|x| {*x as i64}), - - (DataType::Timestamp(TimeUnit::Millisecond, _), PrimitiveArray, ts_millis_values, |x| {*x}) + data_type, + arrays, + ( + ConcreteDataType::Boolean(_), + BooleanVector, + bool_values, + |x| { x } + ), + (ConcreteDataType::Int8(_), Int8Vector, i8_values, |x| { + i32::from(x) + }), + (ConcreteDataType::Int16(_), Int16Vector, i16_values, |x| { + i32::from(x) + }), + (ConcreteDataType::Int32(_), Int32Vector, i32_values, |x| { + x + }), + (ConcreteDataType::Int64(_), Int64Vector, i64_values, |x| { + x + }), + (ConcreteDataType::UInt8(_), UInt8Vector, u8_values, |x| { + u32::from(x) + }), + (ConcreteDataType::UInt16(_), UInt16Vector, u16_values, |x| { + u32::from(x) + }), + (ConcreteDataType::UInt32(_), UInt32Vector, u32_values, |x| { + x + }), + (ConcreteDataType::UInt64(_), UInt64Vector, u64_values, |x| { + x + }), + ( + ConcreteDataType::Float32(_), + Float32Vector, + f32_values, + |x| { x } + ), + ( + ConcreteDataType::Float64(_), + Float64Vector, + f64_values, + |x| { x } + ), + ( + ConcreteDataType::Binary(_), + BinaryVector, + binary_values, + |x| { x.into() } + ), + ( + ConcreteDataType::String(_), + StringVector, + string_values, + |x| { x.into() } + ), + (ConcreteDataType::Date(_), DateVector, date_values, |x| { + x.val() + }), + ( + ConcreteDataType::DateTime(_), + DateTimeVector, + datetime_values, + |x| { x.val() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Second(_)), + TimestampSecondVector, + ts_second_values, + |x| { x.into_native() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Millisecond(_)), + TimestampMillisecondVector, + ts_millisecond_values, + |x| { x.into_native() } + ), + ( + ConcreteDataType::Timestamp(TimestampType::Microsecond(_)), + TimestampMicrosecondVector, + ts_microsecond_values, + |x| { x.into_native() } + ), + ( + 
ConcreteDataType::Timestamp(TimestampType::Nanosecond(_)), + TimestampNanosecondVector, + ts_nanosecond_values, + |x| { x.into_native() } + ) ) } @@ -197,14 +264,10 @@ mod tests { use std::sync::Arc; use common_recordbatch::{RecordBatch, RecordBatches}; - use datafusion::field_util::SchemaExt; - use datatypes::arrow::array::{Array, BooleanArray, PrimitiveArray}; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::arrow_array::StringArray; - use datatypes::schema::Schema; - use datatypes::vectors::{UInt32Vector, VectorRef}; + use datatypes::data_type::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; - use crate::select::{null_mask, try_convert, values}; + use super::*; #[test] fn test_convert_record_batches_to_select_result() { @@ -230,9 +293,8 @@ mod tests { #[test] fn test_convert_arrow_arrays_i32() { - let array: PrimitiveArray = - PrimitiveArray::from(vec![Some(1), Some(2), None, Some(3)]); - let array: Arc = Arc::new(array); + let array = Int32Vector::from(vec![Some(1), Some(2), None, Some(3)]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -241,14 +303,14 @@ mod tests { #[test] fn test_convert_arrow_arrays_string() { - let array = StringArray::from(vec![ + let array = StringVector::from(vec![ Some("1".to_string()), Some("2".to_string()), None, Some("3".to_string()), None, ]); - let array: Arc = Arc::new(array); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -257,8 +319,8 @@ mod tests { #[test] fn test_convert_arrow_arrays_bool() { - let array = BooleanArray::from(vec![Some(true), Some(false), None, Some(false), None]); - let array: Arc = Arc::new(array); + let array = BooleanVector::from(vec![Some(true), Some(false), None, Some(false), None]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); @@ -267,43 +329,42 @@ mod tests { #[test] fn test_convert_arrow_arrays_empty() { - let array = BooleanArray::from(vec![None, None, None, None, None]); - let array: Arc = Arc::new(array); + let array = BooleanVector::from(vec![None, None, None, None, None]); + let array: VectorRef = Arc::new(array); let values = values(&[array]).unwrap(); - assert_eq!(Vec::::default(), values.bool_values); + assert!(values.bool_values.is_empty()); } #[test] fn test_null_mask() { - let a1: Arc = Arc::new(PrimitiveArray::from(vec![None, Some(2), None])); - let a2: Arc = - Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), None, Some(4)])); - let mask = null_mask(&vec![a1, a2], 3 + 4); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![None, Some(2), None])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), None, Some(4)])); + let mask = null_mask(&[a1, a2], 3 + 4); assert_eq!(vec![0b0010_0101], mask); - let empty: Arc = Arc::new(PrimitiveArray::::from(vec![None, None, None])); - let mask = null_mask(&vec![empty.clone(), empty.clone(), empty], 9); + let empty: VectorRef = Arc::new(Int32Vector::from(vec![None, None, None])); + let mask = null_mask(&[empty.clone(), empty.clone(), empty], 9); assert_eq!(vec![0b1111_1111, 0b0000_0001], mask); - let a1: Arc = Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), Some(3)])); - let a2: Arc = Arc::new(PrimitiveArray::from(vec![Some(4), Some(5), Some(6)])); - let mask = null_mask(&vec![a1, a2], 3 + 3); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), Some(3)])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(4), Some(5), Some(6)])); + let mask = 
null_mask(&[a1, a2], 3 + 3); assert_eq!(Vec::::default(), mask); - let a1: Arc = Arc::new(PrimitiveArray::from(vec![Some(1), Some(2), Some(3)])); - let a2: Arc = Arc::new(PrimitiveArray::from(vec![Some(4), Some(5), None])); - let mask = null_mask(&vec![a1, a2], 3 + 3); + let a1: VectorRef = Arc::new(Int32Vector::from(vec![Some(1), Some(2), Some(3)])); + let a2: VectorRef = Arc::new(Int32Vector::from(vec![Some(4), Some(5), None])); + let mask = null_mask(&[a1, a2], 3 + 3); assert_eq!(vec![0b0010_0000], mask); } fn mock_record_batch() -> RecordBatch { - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("c1", DataType::UInt32, false), - Field::new("c2", DataType::UInt32, false), - ])); - let schema = Arc::new(Schema::try_from(arrow_schema).unwrap()); + let column_schemas = vec![ + ColumnSchema::new("c1", ConcreteDataType::uint32_datatype(), true), + ColumnSchema::new("c2", ConcreteDataType::uint32_datatype(), true), + ]; + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let v1 = Arc::new(UInt32Vector::from(vec![Some(1), Some(2), None])); let v2 = Arc::new(UInt32Vector::from(vec![Some(1), None, None])); diff --git a/src/common/grpc/src/writer.rs b/src/common/grpc/src/writer.rs index 2cd28f45af..d05a2908e1 100644 --- a/src/common/grpc/src/writer.rs +++ b/src/common/grpc/src/writer.rs @@ -45,11 +45,11 @@ impl LinesWriter { pub fn write_ts(&mut self, column_name: &str, value: (i64, Precision)) -> Result<()> { let (idx, column) = self.mut_column( column_name, - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, ); ensure!( - column.datatype == ColumnDataType::Timestamp as i32, + column.datatype == ColumnDataType::TimestampMillisecond as i32, TypeMismatchSnafu { column_name, expected: "timestamp", @@ -58,7 +58,9 @@ impl LinesWriter { ); // It is safe to use unwrap here, because values has been initialized in mut_column() let values = column.values.as_mut().unwrap(); - values.ts_millis_values.push(to_ms_ts(value.1, value.0)); + values + .ts_millisecond_values + .push(to_ms_ts(value.1, value.0)); self.null_masks[idx].push(false); Ok(()) } @@ -224,23 +226,23 @@ impl LinesWriter { pub fn to_ms_ts(p: Precision, ts: i64) -> i64 { match p { - Precision::NANOSECOND => ts / 1_000_000, - Precision::MICROSECOND => ts / 1000, - Precision::MILLISECOND => ts, - Precision::SECOND => ts * 1000, - Precision::MINUTE => ts * 1000 * 60, - Precision::HOUR => ts * 1000 * 60 * 60, + Precision::Nanosecond => ts / 1_000_000, + Precision::Microsecond => ts / 1000, + Precision::Millisecond => ts, + Precision::Second => ts * 1000, + Precision::Minute => ts * 1000 * 60, + Precision::Hour => ts * 1000 * 60 * 60, } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Precision { - NANOSECOND, - MICROSECOND, - MILLISECOND, - SECOND, - MINUTE, - HOUR, + Nanosecond, + Microsecond, + Millisecond, + Second, + Minute, + Hour, } #[cfg(test)] @@ -261,13 +263,13 @@ mod tests { writer.write_f64("memory", 0.4).unwrap(); writer.write_string("name", "name1").unwrap(); writer - .write_ts("ts", (101011000, Precision::MILLISECOND)) + .write_ts("ts", (101011000, Precision::Millisecond)) .unwrap(); writer.commit(); writer.write_tag("host", "host2").unwrap(); writer - .write_ts("ts", (102011001, Precision::MILLISECOND)) + .write_ts("ts", (102011001, Precision::Millisecond)) .unwrap(); writer.write_bool("enable_reboot", true).unwrap(); writer.write_u64("year_of_service", 2).unwrap(); @@ -278,7 +280,7 @@ mod tests { writer.write_f64("cpu", 0.4).unwrap(); 
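        // For reference: `write_ts` normalizes every precision to milliseconds via
        // `to_ms_ts`, which uses plain integer arithmetic, so sub-millisecond input
        // truncates toward zero. A minimal sketch (not part of this test):
        //
        //     assert_eq!(1, to_ms_ts(Precision::Nanosecond, 1_999_999));
        //     assert_eq!(2_000, to_ms_ts(Precision::Second, 2));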
writer.write_u64("cpu_core_num", 16).unwrap(); writer - .write_ts("ts", (103011002, Precision::MILLISECOND)) + .write_ts("ts", (103011002, Precision::Millisecond)) .unwrap(); writer.commit(); @@ -321,11 +323,11 @@ mod tests { let column = &columns[4]; assert_eq!("ts", column.column_name); - assert_eq!(ColumnDataType::Timestamp as i32, column.datatype); + assert_eq!(ColumnDataType::TimestampMillisecond as i32, column.datatype); assert_eq!(SemanticType::Timestamp as i32, column.semantic_type); assert_eq!( vec![101011000, 102011001, 103011002], - column.values.as_ref().unwrap().ts_millis_values + column.values.as_ref().unwrap().ts_millisecond_values ); verify_null_mask(&column.null_mask, vec![false, false, false]); @@ -367,16 +369,16 @@ mod tests { #[test] fn test_to_ms() { - assert_eq!(100, to_ms_ts(Precision::NANOSECOND, 100110000)); - assert_eq!(100110, to_ms_ts(Precision::MICROSECOND, 100110000)); - assert_eq!(100110000, to_ms_ts(Precision::MILLISECOND, 100110000)); + assert_eq!(100, to_ms_ts(Precision::Nanosecond, 100110000)); + assert_eq!(100110, to_ms_ts(Precision::Microsecond, 100110000)); + assert_eq!(100110000, to_ms_ts(Precision::Millisecond, 100110000)); assert_eq!( 100110000 * 1000 * 60, - to_ms_ts(Precision::MINUTE, 100110000) + to_ms_ts(Precision::Minute, 100110000) ); assert_eq!( 100110000 * 1000 * 60 * 60, - to_ms_ts(Precision::HOUR, 100110000) + to_ms_ts(Precision::Hour, 100110000) ); } } diff --git a/src/common/query/Cargo.toml b/src/common/query/Cargo.toml index 7b9f87617b..bd8f0bbf3a 100644 --- a/src/common/query/Cargo.toml +++ b/src/common/query/Cargo.toml @@ -9,11 +9,9 @@ async-trait = "0.1" common-error = { path = "../error" } common-recordbatch = { path = "../recordbatch" } common-time = { path = "../time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" +datafusion-expr = "14.0.0" datatypes = { path = "../../datatypes" } snafu = { version = "0.7", features = ["backtraces"] } statrs = "0.15" diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 7c5c224d1e..25c169baa5 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -23,16 +23,9 @@ use datatypes::error::Error as DataTypeError; use datatypes::prelude::ConcreteDataType; use statrs::StatsError; -common_error::define_opaque_error!(Error); - #[derive(Debug, Snafu)] #[snafu(visibility(pub))] -pub enum InnerError { - #[snafu(display("Fail to cast array to {:?}, source: {}", typ, source))] - TypeCast { - source: ArrowError, - typ: arrow::datatypes::DataType, - }, +pub enum Error { #[snafu(display("Fail to execute function, source: {}", source))] ExecuteFunction { source: DataFusionError, @@ -83,8 +76,8 @@ pub enum InnerError { backtrace: Backtrace, }, - #[snafu(display("Invalid inputs: {}", err_msg))] - InvalidInputs { + #[snafu(display("Invalid input type: {}", err_msg))] + InvalidInputType { #[snafu(backtrace)] source: DataTypeError, err_msg: String, @@ -133,37 +126,74 @@ pub enum InnerError { #[snafu(backtrace)] source: BoxedError, }, + + #[snafu(display("Failed to cast array to {:?}, source: {}", typ, source))] + TypeCast { + source: ArrowError, + typ: arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to perform 
compute operation on arrow arrays, source: {}", + source + ))] + ArrowCompute { + source: ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Query engine fail to cast value: {}", source))] + ToScalarValue { + #[snafu(backtrace)] + source: DataTypeError, + }, + + #[snafu(display("Failed to get scalar vector, {}", source))] + GetScalarVector { + #[snafu(backtrace)] + source: DataTypeError, + }, + + #[snafu(display("Invalid function args: {}", err_msg))] + InvalidFuncArgs { + err_msg: String, + backtrace: Backtrace, + }, } pub type Result = std::result::Result; -impl ErrorExt for InnerError { +impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - InnerError::ExecuteFunction { .. } - | InnerError::GenerateFunction { .. } - | InnerError::CreateAccumulator { .. } - | InnerError::DowncastVector { .. } - | InnerError::InvalidInputState { .. } - | InnerError::InvalidInputCol { .. } - | InnerError::BadAccumulatorImpl { .. } => StatusCode::EngineExecuteQuery, + Error::ExecuteFunction { .. } + | Error::GenerateFunction { .. } + | Error::CreateAccumulator { .. } + | Error::DowncastVector { .. } + | Error::InvalidInputState { .. } + | Error::InvalidInputCol { .. } + | Error::BadAccumulatorImpl { .. } + | Error::ToScalarValue { .. } + | Error::GetScalarVector { .. } + | Error::ArrowCompute { .. } => StatusCode::EngineExecuteQuery, - InnerError::InvalidInputs { source, .. } - | InnerError::IntoVector { source, .. } - | InnerError::FromScalarValue { source } - | InnerError::ConvertArrowSchema { source } - | InnerError::FromArrowArray { source } => source.status_code(), + Error::InvalidInputType { source, .. } + | Error::IntoVector { source, .. } + | Error::FromScalarValue { source } + | Error::ConvertArrowSchema { source } + | Error::FromArrowArray { source } => source.status_code(), - InnerError::ExecuteRepeatedly { .. } - | InnerError::GeneralDataFusion { .. } - | InnerError::DataFusionExecutionPlan { .. } => StatusCode::Unexpected, + Error::ExecuteRepeatedly { .. } + | Error::GeneralDataFusion { .. } + | Error::DataFusionExecutionPlan { .. } => StatusCode::Unexpected, - InnerError::UnsupportedInputDataType { .. } | InnerError::TypeCast { .. } => { - StatusCode::InvalidArguments - } + Error::UnsupportedInputDataType { .. } + | Error::TypeCast { .. } + | Error::InvalidFuncArgs { .. } => StatusCode::InvalidArguments, - InnerError::ConvertDfRecordBatchStream { source, .. } => source.status_code(), - InnerError::ExecutePhysicalPlan { source } => source.status_code(), + Error::ConvertDfRecordBatchStream { source, .. 
} => source.status_code(), + Error::ExecutePhysicalPlan { source } => source.status_code(), } } @@ -176,12 +206,6 @@ impl ErrorExt for InnerError { } } -impl From for Error { - fn from(e: InnerError) -> Error { - Error::new(e) - } -} - impl From for DataFusionError { fn from(e: Error) -> DataFusionError { DataFusionError::External(Box::new(e)) @@ -190,7 +214,7 @@ impl From for DataFusionError { impl From for Error { fn from(source: BoxedError) -> Self { - InnerError::ExecutePhysicalPlan { source }.into() + Error::ExecutePhysicalPlan { source } } } @@ -206,60 +230,51 @@ mod tests { } fn assert_error(err: &Error, code: StatusCode) { - let inner_err = err.as_any().downcast_ref::().unwrap(); + let inner_err = err.as_any().downcast_ref::().unwrap(); assert_eq!(code, inner_err.status_code()); assert!(inner_err.backtrace_opt().is_some()); } #[test] fn test_datafusion_as_source() { - let err: Error = throw_df_error() + let err = throw_df_error() .context(ExecuteFunctionSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::EngineExecuteQuery); let err: Error = throw_df_error() .context(GeneralDataFusionSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::Unexpected); - let err: Error = throw_df_error() + let err = throw_df_error() .context(DataFusionExecutionPlanSnafu) .err() - .unwrap() - .into(); + .unwrap(); assert_error(&err, StatusCode::Unexpected); } #[test] fn test_execute_repeatedly_error() { - let error: Error = None:: - .context(ExecuteRepeatedlySnafu) - .err() - .unwrap() - .into(); - assert_eq!(error.inner.status_code(), StatusCode::Unexpected); + let error = None::.context(ExecuteRepeatedlySnafu).err().unwrap(); + assert_eq!(error.status_code(), StatusCode::Unexpected); assert!(error.backtrace_opt().is_some()); } #[test] fn test_convert_df_recordbatch_stream_error() { let result: std::result::Result = - Err(common_recordbatch::error::InnerError::PollStream { - source: ArrowError::Overflow, + Err(common_recordbatch::error::Error::PollStream { + source: ArrowError::DivideByZero, backtrace: Backtrace::generate(), - } - .into()); - let error: Error = result + }); + let error = result .context(ConvertDfRecordBatchStreamSnafu) .err() - .unwrap() - .into(); - assert_eq!(error.inner.status_code(), StatusCode::Internal); + .unwrap(); + assert_eq!(error.status_code(), StatusCode::Internal); assert!(error.backtrace_opt().is_some()); } @@ -272,13 +287,12 @@ mod tests { #[test] fn test_into_vector_error() { - let err: Error = raise_datatype_error() + let err = raise_datatype_error() .context(IntoVectorSnafu { data_type: ArrowDatatype::Int32, }) .err() - .unwrap() - .into(); + .unwrap(); assert!(err.backtrace_opt().is_some()); let datatype_err = raise_datatype_error().err().unwrap(); assert_eq!(datatype_err.status_code(), err.status_code()); diff --git a/src/common/query/src/logical_plan/mod.rs b/src/common/query/src/logical_plan.rs similarity index 97% rename from src/common/query/src/logical_plan/mod.rs rename to src/common/query/src/logical_plan.rs index 5f57cd96aa..a0df518ce7 100644 --- a/src/common/query/src/logical_plan/mod.rs +++ b/src/common/query/src/logical_plan.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use datatypes::prelude::ConcreteDataType; pub use self::accumulator::{Accumulator, AggregateFunctionCreator, AggregateFunctionCreatorRef}; -pub use self::expr::Expr; +pub use self::expr::{DfExpr, Expr}; pub use self::udaf::AggregateFunction; pub use self::udf::ScalarUdf; use crate::function::{ReturnTypeFunction, 
ScalarFunctionImplementation}; @@ -148,9 +148,7 @@ mod tests { let args = vec![ DfColumnarValue::Scalar(ScalarValue::Boolean(Some(true))), - DfColumnarValue::Array(Arc::new(BooleanArray::from_slice(vec![ - true, false, false, true, - ]))), + DfColumnarValue::Array(Arc::new(BooleanArray::from(vec![true, false, false, true]))), ]; // call the function diff --git a/src/common/query/src/logical_plan/accumulator.rs b/src/common/query/src/logical_plan/accumulator.rs index 717214f3ff..cce139094e 100644 --- a/src/common/query/src/logical_plan/accumulator.rs +++ b/src/common/query/src/logical_plan/accumulator.rs @@ -17,12 +17,10 @@ use std::fmt::Debug; use std::sync::Arc; -use common_time::timestamp::TimeUnit; use datafusion_common::Result as DfResult; -use datafusion_expr::Accumulator as DfAccumulator; +use datafusion_expr::{Accumulator as DfAccumulator, AggregateState}; use datatypes::arrow::array::ArrayRef; use datatypes::prelude::*; -use datatypes::value::ListValue; use datatypes::vectors::{Helper as VectorHelper, VectorRef}; use snafu::ResultExt; @@ -128,356 +126,53 @@ impl DfAccumulatorAdaptor { } impl DfAccumulator for DfAccumulatorAdaptor { - fn state(&self) -> DfResult> { + fn state(&self) -> DfResult> { let state_values = self.accumulator.state()?; let state_types = self.creator.state_types()?; if state_values.len() != state_types.len() { return error::BadAccumulatorImplSnafu { err_msg: format!("Accumulator {:?} returned state values size do not match its state types size.", self), } - .fail() - .map_err(Error::from)?; + .fail()?; } Ok(state_values .into_iter() .zip(state_types.iter()) - .map(|(v, t)| try_into_scalar_value(v, t)) - .collect::>>() - .map_err(Error::from)?) + .map(|(v, t)| { + let scalar = v + .try_to_scalar_value(t) + .context(error::ToScalarValueSnafu)?; + Ok(AggregateState::Scalar(scalar)) + }) + .collect::>>()?) } fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> { - let vectors = VectorHelper::try_into_vectors(values) - .context(FromScalarValueSnafu) - .map_err(Error::from)?; - self.accumulator - .update_batch(&vectors) - .map_err(|e| e.into()) + let vectors = VectorHelper::try_into_vectors(values).context(FromScalarValueSnafu)?; + self.accumulator.update_batch(&vectors)?; + Ok(()) } fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> { let mut vectors = Vec::with_capacity(states.len()); for array in states.iter() { vectors.push( - VectorHelper::try_into_vector(array) - .context(IntoVectorSnafu { - data_type: array.data_type().clone(), - }) - .map_err(Error::from)?, + VectorHelper::try_into_vector(array).context(IntoVectorSnafu { + data_type: array.data_type().clone(), + })?, ); } - self.accumulator.merge_batch(&vectors).map_err(|e| e.into()) + self.accumulator.merge_batch(&vectors)?; + Ok(()) } fn evaluate(&self) -> DfResult { let value = self.accumulator.evaluate()?; let output_type = self.creator.output_type()?; - Ok(try_into_scalar_value(value, &output_type)?) 
- } -} - -fn try_into_scalar_value(value: Value, datatype: &ConcreteDataType) -> Result { - if !matches!(value, Value::Null) && datatype != &value.data_type() { - return error::BadAccumulatorImplSnafu { - err_msg: format!( - "expect value to return datatype {:?}, actual: {:?}", - datatype, - value.data_type() - ), - } - .fail()?; - } - - Ok(match value { - Value::Boolean(v) => ScalarValue::Boolean(Some(v)), - Value::UInt8(v) => ScalarValue::UInt8(Some(v)), - Value::UInt16(v) => ScalarValue::UInt16(Some(v)), - Value::UInt32(v) => ScalarValue::UInt32(Some(v)), - Value::UInt64(v) => ScalarValue::UInt64(Some(v)), - Value::Int8(v) => ScalarValue::Int8(Some(v)), - Value::Int16(v) => ScalarValue::Int16(Some(v)), - Value::Int32(v) => ScalarValue::Int32(Some(v)), - Value::Int64(v) => ScalarValue::Int64(Some(v)), - Value::Float32(v) => ScalarValue::Float32(Some(v.0)), - Value::Float64(v) => ScalarValue::Float64(Some(v.0)), - Value::String(v) => ScalarValue::Utf8(Some(v.as_utf8().to_string())), - Value::Binary(v) => ScalarValue::LargeBinary(Some(v.to_vec())), - Value::Date(v) => ScalarValue::Date32(Some(v.val())), - Value::DateTime(v) => ScalarValue::Date64(Some(v.val())), - Value::Null => try_convert_null_value(datatype)?, - Value::List(list) => try_convert_list_value(list)?, - Value::Timestamp(t) => timestamp_to_scalar_value(t.unit(), Some(t.value())), - }) -} - -fn timestamp_to_scalar_value(unit: TimeUnit, val: Option) -> ScalarValue { - match unit { - TimeUnit::Second => ScalarValue::TimestampSecond(val, None), - TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(val, None), - TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(val, None), - TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(val, None), - } -} - -fn try_convert_null_value(datatype: &ConcreteDataType) -> Result { - Ok(match datatype { - ConcreteDataType::Boolean(_) => ScalarValue::Boolean(None), - ConcreteDataType::Int8(_) => ScalarValue::Int8(None), - ConcreteDataType::Int16(_) => ScalarValue::Int16(None), - ConcreteDataType::Int32(_) => ScalarValue::Int32(None), - ConcreteDataType::Int64(_) => ScalarValue::Int64(None), - ConcreteDataType::UInt8(_) => ScalarValue::UInt8(None), - ConcreteDataType::UInt16(_) => ScalarValue::UInt16(None), - ConcreteDataType::UInt32(_) => ScalarValue::UInt32(None), - ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None), - ConcreteDataType::Float32(_) => ScalarValue::Float32(None), - ConcreteDataType::Float64(_) => ScalarValue::Float64(None), - ConcreteDataType::Binary(_) => ScalarValue::LargeBinary(None), - ConcreteDataType::String(_) => ScalarValue::Utf8(None), - ConcreteDataType::Timestamp(t) => timestamp_to_scalar_value(t.unit, None), - _ => { - return error::BadAccumulatorImplSnafu { - err_msg: format!( - "undefined transition from null value to datatype {:?}", - datatype - ), - } - .fail()? 
- } - }) -} - -fn try_convert_list_value(list: ListValue) -> Result { - let vs = if let Some(items) = list.items() { - Some(Box::new( - items - .iter() - .map(|v| try_into_scalar_value(v.clone(), list.datatype())) - .collect::>>()?, - )) - } else { - None - }; - Ok(ScalarValue::List( - vs, - Box::new(list.datatype().as_arrow_type()), - )) -} - -#[cfg(test)] -mod tests { - use common_base::bytes::{Bytes, StringBytes}; - use datafusion_common::ScalarValue; - use datatypes::arrow::datatypes::DataType; - use datatypes::value::{ListValue, OrderedFloat}; - - use super::*; - - #[test] - fn test_not_null_value_to_scalar_value() { - assert_eq!( - ScalarValue::Boolean(Some(true)), - try_into_scalar_value(Value::Boolean(true), &ConcreteDataType::boolean_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::Boolean(Some(false)), - try_into_scalar_value(Value::Boolean(false), &ConcreteDataType::boolean_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt8(Some(u8::MIN + 1)), - try_into_scalar_value( - Value::UInt8(u8::MIN + 1), - &ConcreteDataType::uint8_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt16(Some(u16::MIN + 2)), - try_into_scalar_value( - Value::UInt16(u16::MIN + 2), - &ConcreteDataType::uint16_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt32(Some(u32::MIN + 3)), - try_into_scalar_value( - Value::UInt32(u32::MIN + 3), - &ConcreteDataType::uint32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::UInt64(Some(u64::MIN + 4)), - try_into_scalar_value( - Value::UInt64(u64::MIN + 4), - &ConcreteDataType::uint64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int8(Some(i8::MIN + 4)), - try_into_scalar_value(Value::Int8(i8::MIN + 4), &ConcreteDataType::int8_datatype()) - .unwrap() - ); - assert_eq!( - ScalarValue::Int16(Some(i16::MIN + 5)), - try_into_scalar_value( - Value::Int16(i16::MIN + 5), - &ConcreteDataType::int16_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int32(Some(i32::MIN + 6)), - try_into_scalar_value( - Value::Int32(i32::MIN + 6), - &ConcreteDataType::int32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Int64(Some(i64::MIN + 7)), - try_into_scalar_value( - Value::Int64(i64::MIN + 7), - &ConcreteDataType::int64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Float32(Some(8.0f32)), - try_into_scalar_value( - Value::Float32(OrderedFloat(8.0f32)), - &ConcreteDataType::float32_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Float64(Some(9.0f64)), - try_into_scalar_value( - Value::Float64(OrderedFloat(9.0f64)), - &ConcreteDataType::float64_datatype() - ) - .unwrap() - ); - assert_eq!( - ScalarValue::Utf8(Some("hello".to_string())), - try_into_scalar_value( - Value::String(StringBytes::from("hello")), - &ConcreteDataType::string_datatype(), - ) - .unwrap() - ); - assert_eq!( - ScalarValue::LargeBinary(Some("world".as_bytes().to_vec())), - try_into_scalar_value( - Value::Binary(Bytes::from("world".as_bytes())), - &ConcreteDataType::binary_datatype() - ) - .unwrap() - ); - } - - #[test] - fn test_null_value_to_scalar_value() { - assert_eq!( - ScalarValue::Boolean(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::boolean_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint8_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt16(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint16_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt32(None), - 
try_into_scalar_value(Value::Null, &ConcreteDataType::uint32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::UInt64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::uint64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int8_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int16(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int16_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int32(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Int64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::int64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Float32(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::float32_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Float64(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::float64_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::Utf8(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::string_datatype()).unwrap() - ); - assert_eq!( - ScalarValue::LargeBinary(None), - try_into_scalar_value(Value::Null, &ConcreteDataType::binary_datatype()).unwrap() - ); - } - - #[test] - fn test_list_value_to_scalar_value() { - let items = Some(Box::new(vec![Value::Int32(-1), Value::Null])); - let list = Value::List(ListValue::new(items, ConcreteDataType::int32_datatype())); - let df_list = try_into_scalar_value( - list, - &ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - ) - .unwrap(); - assert!(matches!(df_list, ScalarValue::List(_, _))); - match df_list { - ScalarValue::List(vs, datatype) => { - assert_eq!(*datatype, DataType::Int32); - - assert!(vs.is_some()); - let vs = *vs.unwrap(); - assert_eq!( - vs, - vec![ScalarValue::Int32(Some(-1)), ScalarValue::Int32(None)] - ); - } - _ => unreachable!(), - } - } - - #[test] - pub fn test_timestamp_to_scalar_value() { - assert_eq!( - ScalarValue::TimestampSecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Second, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampMillisecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Millisecond, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampMicrosecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Microsecond, Some(1)) - ); - assert_eq!( - ScalarValue::TimestampNanosecond(Some(1), None), - timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(1)) - ); + let scalar_value = value + .try_to_scalar_value(&output_type) + .context(error::ToScalarValueSnafu) + .map_err(Error::from)?; + Ok(scalar_value) } } diff --git a/src/common/query/src/logical_plan/expr.rs b/src/common/query/src/logical_plan/expr.rs index 45cb12cdeb..cc8aa1bea3 100644 --- a/src/common/query/src/logical_plan/expr.rs +++ b/src/common/query/src/logical_plan/expr.rs @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use datafusion::logical_plan::Expr as DfExpr; +pub use datafusion_expr::expr::Expr as DfExpr; /// Central struct of query API. /// Represent logical expressions such as `A + 1`, or `CAST(c1 AS int)`. 
-#[derive(Clone, PartialEq, Hash, Debug)] +#[derive(Clone, PartialEq, Eq, Hash, Debug)] pub struct Expr { df_expr: DfExpr, } diff --git a/src/common/query/src/logical_plan/udaf.rs b/src/common/query/src/logical_plan/udaf.rs index 6fb4a2f68a..1f3fb26a98 100644 --- a/src/common/query/src/logical_plan/udaf.rs +++ b/src/common/query/src/logical_plan/udaf.rs @@ -104,7 +104,7 @@ fn to_df_accumulator_func( accumulator: AccumulatorFunctionImpl, creator: AggregateFunctionCreatorRef, ) -> DfAccumulatorFunctionImplementation { - Arc::new(move || { + Arc::new(move |_| { let accumulator = accumulator()?; let creator = creator.clone(); Ok(Box::new(DfAccumulatorAdaptor::new(accumulator, creator))) diff --git a/src/common/query/src/physical_plan.rs b/src/common/query/src/physical_plan.rs index fae0443897..42bb70087e 100644 --- a/src/common/query/src/physical_plan.rs +++ b/src/common/query/src/physical_plan.rs @@ -16,12 +16,11 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -use async_trait::async_trait; -use common_recordbatch::adapter::{AsyncRecordBatchStreamAdapter, DfRecordBatchStreamAdapter}; +use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchStreamAdapter}; use common_recordbatch::{DfSendableRecordBatchStream, SendableRecordBatchStream}; use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef; use datafusion::error::Result as DfResult; -pub use datafusion::execution::runtime_env::RuntimeEnv; +pub use datafusion::execution::context::{SessionContext, TaskContext}; use datafusion::physical_plan::expressions::PhysicalSortExpr; pub use datafusion::physical_plan::Partitioning; use datafusion::physical_plan::Statistics; @@ -63,7 +62,7 @@ pub trait PhysicalPlan: Debug + Send + Sync { fn execute( &self, partition: usize, - runtime: Arc, + context: Arc, ) -> Result; } @@ -111,6 +110,7 @@ impl PhysicalPlan for PhysicalPlanAdapter { .collect(); let plan = self .df_plan + .clone() .with_new_children(children) .context(error::GeneralDataFusionSnafu)?; Ok(Arc::new(PhysicalPlanAdapter::new(self.schema(), plan))) @@ -119,20 +119,22 @@ impl PhysicalPlan for PhysicalPlanAdapter { fn execute( &self, partition: usize, - runtime: Arc, + context: Arc, ) -> Result { let df_plan = self.df_plan.clone(); - let stream = Box::pin(async move { df_plan.execute(partition, runtime).await }); - let stream = AsyncRecordBatchStreamAdapter::new(self.schema(), stream); + let stream = df_plan + .execute(partition, context) + .context(error::GeneralDataFusionSnafu)?; + let adapter = RecordBatchStreamAdapter::try_new(stream) + .context(error::ConvertDfRecordBatchStreamSnafu)?; - Ok(Box::pin(stream)) + Ok(Box::pin(adapter)) } } #[derive(Debug)] pub struct DfPhysicalPlanAdapter(pub PhysicalPlanRef); -#[async_trait] impl DfPhysicalPlan for DfPhysicalPlanAdapter { fn as_any(&self) -> &dyn Any { self @@ -159,15 +161,14 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { } fn with_new_children( - &self, + self: Arc, children: Vec>, ) -> DfResult> { let df_schema = self.schema(); let schema: SchemaRef = Arc::new( df_schema .try_into() - .context(error::ConvertArrowSchemaSnafu) - .map_err(error::Error::from)?, + .context(error::ConvertArrowSchemaSnafu)?, ); let children = children .into_iter() @@ -177,12 +178,12 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { Ok(Arc::new(DfPhysicalPlanAdapter(plan))) } - async fn execute( + fn execute( &self, partition: usize, - runtime: Arc, + context: Arc, ) -> DfResult { - let stream = self.0.execute(partition, runtime)?; + let stream = self.0.execute(partition, 
context)?; Ok(Box::pin(DfRecordBatchStreamAdapter::new(stream))) } @@ -194,16 +195,16 @@ impl DfPhysicalPlan for DfPhysicalPlanAdapter { #[cfg(test)] mod test { + use async_trait::async_trait; use common_recordbatch::{RecordBatch, RecordBatches}; - use datafusion::arrow_print; - use datafusion::datasource::TableProvider as DfTableProvider; - use datafusion::logical_plan::LogicalPlanBuilder; + use datafusion::datasource::{DefaultTableSource, TableProvider as DfTableProvider, TableType}; + use datafusion::execution::context::{SessionContext, SessionState}; use datafusion::physical_plan::collect; use datafusion::physical_plan::empty::EmptyExec; - use datafusion::prelude::ExecutionContext; - use datafusion_common::field_util::SchemaExt; - use datafusion_expr::Expr; + use datafusion_expr::logical_plan::builder::LogicalPlanBuilder; + use datafusion_expr::{Expr, TableSource}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use datatypes::arrow::util::pretty; use datatypes::schema::Schema; use datatypes::vectors::Int32Vector; @@ -225,8 +226,13 @@ mod test { )])) } + fn table_type(&self) -> TableType { + TableType::Base + } + async fn scan( &self, + _ctx: &SessionState, _projection: &Option>, _filters: &[Expr], _limit: Option, @@ -240,6 +246,14 @@ mod test { } } + impl MyDfTableProvider { + fn table_source() -> Arc { + Arc::new(DefaultTableSource { + table_provider: Arc::new(Self), + }) + } + } + #[derive(Debug)] struct MyExecutionPlan { schema: SchemaRef, @@ -269,7 +283,7 @@ mod test { fn execute( &self, _partition: usize, - _runtime: Arc, + _context: Arc, ) -> Result { let schema = self.schema(); let recordbatches = RecordBatches::try_new( @@ -295,20 +309,26 @@ mod test { // Test our physical plan can be executed by DataFusion, through adapters. #[tokio::test] async fn test_execute_physical_plan() { - let ctx = ExecutionContext::new(); - let logical_plan = LogicalPlanBuilder::scan("test", Arc::new(MyDfTableProvider), None) - .unwrap() - .build() - .unwrap(); + let ctx = SessionContext::new(); + let logical_plan = + LogicalPlanBuilder::scan("test", MyDfTableProvider::table_source(), None) + .unwrap() + .build() + .unwrap(); let physical_plan = ctx.create_physical_plan(&logical_plan).await.unwrap(); - let df_recordbatches = collect(physical_plan, Arc::new(RuntimeEnv::default())) + let df_recordbatches = collect(physical_plan, Arc::new(TaskContext::from(&ctx))) .await .unwrap(); - let pretty_print = arrow_print::write(&df_recordbatches); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = pretty::pretty_format_batches(&df_recordbatches).unwrap(); assert_eq!( - pretty_print, - vec!["+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+",] + pretty_print.to_string(), + r#"+---+ +| a | ++---+ +| 1 | +| 2 | +| 3 | ++---+"# ); } diff --git a/src/common/query/src/signature.rs b/src/common/query/src/signature.rs index c8d4963b6e..1d57ee7992 100644 --- a/src/common/query/src/signature.rs +++ b/src/common/query/src/signature.rs @@ -15,7 +15,7 @@ //! Signature module contains foundational types that are used to represent signatures, types, //! and return types of functions. //! Copied and modified from datafusion. 
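//
// Note: `Volatility` is now re-exported from `datafusion_expr` rather than
// `datafusion::physical_plan::functions`. A minimal sketch of how it combines
// with the aliased `DfSignature`, assuming the `Signature::exact` constructor
// of datafusion-expr 14:
//
//     let _sig = DfSignature::exact(vec![ArrowDataType::Int64], Volatility::Immutable);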
-pub use datafusion::physical_plan::functions::Volatility; +pub use datafusion_expr::Volatility; use datafusion_expr::{Signature as DfSignature, TypeSignature as DfTypeSignature}; use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::data_type::DataType; diff --git a/src/common/recordbatch/Cargo.toml b/src/common/recordbatch/Cargo.toml index a823612061..634ec64410 100644 --- a/src/common/recordbatch/Cargo.toml +++ b/src/common/recordbatch/Cargo.toml @@ -6,10 +6,8 @@ license = "Apache-2.0" [dependencies] common-error = { path = "../error" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" datatypes = { path = "../../datatypes" } futures = "0.3" paste = "1.0" diff --git a/src/common/recordbatch/src/adapter.rs b/src/common/recordbatch/src/adapter.rs index 2994d8f078..2b8436ec4e 100644 --- a/src/common/recordbatch/src/adapter.rs +++ b/src/common/recordbatch/src/adapter.rs @@ -19,7 +19,6 @@ use std::task::{Context, Poll}; use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef; use datafusion::physical_plan::RecordBatchStream as DfRecordBatchStream; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; use datafusion_common::DataFusionError; use datatypes::arrow::error::{ArrowError, Result as ArrowResult}; use datatypes::schema::{Schema, SchemaRef}; @@ -28,7 +27,8 @@ use snafu::ResultExt; use crate::error::{self, Result}; use crate::{ - DfSendableRecordBatchStream, RecordBatch, RecordBatchStream, SendableRecordBatchStream, Stream, + DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream, + SendableRecordBatchStream, Stream, }; type FutureStream = Pin< @@ -63,8 +63,8 @@ impl Stream for DfRecordBatchStreamAdapter { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, Poll::Ready(Some(recordbatch)) => match recordbatch { - Ok(recordbatch) => Poll::Ready(Some(Ok(recordbatch.df_recordbatch))), - Err(e) => Poll::Ready(Some(Err(ArrowError::External("".to_owned(), Box::new(e))))), + Ok(recordbatch) => Poll::Ready(Some(Ok(recordbatch.into_df_record_batch()))), + Err(e) => Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new(e))))), }, Poll::Ready(None) => Poll::Ready(None), } @@ -102,10 +102,13 @@ impl Stream for RecordBatchStreamAdapter { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, - Poll::Ready(Some(df_recordbatch)) => Poll::Ready(Some(Ok(RecordBatch { - schema: self.schema(), - df_recordbatch: df_recordbatch.context(error::PollStreamSnafu)?, - }))), + Poll::Ready(Some(df_record_batch)) => { + let df_record_batch = df_record_batch.context(error::PollStreamSnafu)?; + Poll::Ready(Some(RecordBatch::try_from_df_record_batch( + self.schema(), + df_record_batch, + ))) + } Poll::Ready(None) => Poll::Ready(None), } } @@ -157,10 +160,8 @@ impl Stream for AsyncRecordBatchStreamAdapter { AsyncRecordBatchStreamAdapterState::Inited(stream) => match stream { Ok(stream) => { return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)).map(|df| { - Ok(RecordBatch { - schema: self.schema(), - df_recordbatch: df.context(error::PollStreamSnafu)?, - }) + let df_record_batch = df.context(error::PollStreamSnafu)?; + RecordBatch::try_from_df_record_batch(self.schema(), df_record_batch) })); } Err(e) => { @@ -168,8 
+169,7 @@ impl Stream for AsyncRecordBatchStreamAdapter { error::CreateRecordBatchesSnafu { reason: format!("Read error {:?} from stream", e), } - .fail() - .map_err(|e| e.into()), + .fail(), )) } }, diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs index 2425defad8..0937441338 100644 --- a/src/common/recordbatch/src/error.rs +++ b/src/common/recordbatch/src/error.rs @@ -17,13 +17,12 @@ use std::any::Any; use common_error::ext::BoxedError; use common_error::prelude::*; -common_error::define_opaque_error!(Error); pub type Result = std::result::Result; #[derive(Debug, Snafu)] #[snafu(visibility(pub))] -pub enum InnerError { +pub enum Error { #[snafu(display("Fail to create datafusion record batch, source: {}", source))] NewDfRecordBatch { source: datatypes::arrow::error::ArrowError, @@ -59,20 +58,27 @@ pub enum InnerError { source: datatypes::arrow::error::ArrowError, backtrace: Backtrace, }, + + #[snafu(display("Fail to format record batch, source: {}", source))] + Format { + source: datatypes::arrow::error::ArrowError, + backtrace: Backtrace, + }, } -impl ErrorExt for InnerError { +impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - InnerError::NewDfRecordBatch { .. } => StatusCode::InvalidArguments, + Error::NewDfRecordBatch { .. } => StatusCode::InvalidArguments, - InnerError::DataTypes { .. } - | InnerError::CreateRecordBatches { .. } - | InnerError::PollStream { .. } => StatusCode::Internal, + Error::DataTypes { .. } + | Error::CreateRecordBatches { .. } + | Error::PollStream { .. } + | Error::Format { .. } => StatusCode::Internal, - InnerError::External { source } => source.status_code(), + Error::External { source } => source.status_code(), - InnerError::SchemaConversion { source, .. } => source.status_code(), + Error::SchemaConversion { source, .. 
} => source.status_code(), } } @@ -84,9 +90,3 @@ impl ErrorExt for InnerError { self } } - -impl From for Error { - fn from(e: InnerError) -> Error { - Error::new(e) - } -} diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 2809040326..be96a94a50 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -20,16 +20,17 @@ pub mod util; use std::pin::Pin; use std::sync::Arc; -use datafusion::arrow_print; use datafusion::physical_plan::memory::MemoryStream; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; +pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; +use datatypes::arrow::util::pretty; use datatypes::prelude::VectorRef; use datatypes::schema::{Schema, SchemaRef}; use error::Result; use futures::task::{Context, Poll}; use futures::{Stream, TryStreamExt}; pub use recordbatch::RecordBatch; -use snafu::ensure; +use snafu::{ensure, ResultExt}; pub trait RecordBatchStream: Stream> { fn schema(&self) -> SchemaRef; @@ -65,7 +66,7 @@ impl Stream for EmptyRecordBatchStream { } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub struct RecordBatches { schema: SchemaRef, batches: Vec, @@ -98,17 +99,18 @@ impl RecordBatches { self.batches.iter() } - pub fn pretty_print(&self) -> String { - arrow_print::write( - &self - .iter() - .map(|x| x.df_recordbatch.clone()) - .collect::>(), - ) + pub fn pretty_print(&self) -> Result { + let df_batches = &self + .iter() + .map(|x| x.df_record_batch().clone()) + .collect::>(); + let result = pretty::pretty_format_batches(df_batches).context(error::FormatSnafu)?; + + Ok(result.to_string()) } pub fn try_new(schema: SchemaRef, batches: Vec) -> Result { - for batch in batches.iter() { + for batch in &batches { ensure!( batch.schema == schema, error::CreateRecordBatchesSnafu { @@ -144,7 +146,7 @@ impl RecordBatches { let df_record_batches = self .batches .into_iter() - .map(|batch| batch.df_recordbatch) + .map(|batch| batch.into_df_record_batch()) .collect(); // unwrap safety: `MemoryStream::try_new` won't fail Box::pin( @@ -242,7 +244,7 @@ mod tests { | 1 | hello | | 2 | world | +---+-------+"; - assert_eq!(batches.pretty_print(), expected); + assert_eq!(batches.pretty_print().unwrap(), expected); assert_eq!(schema1, batches.schema()); assert_eq!(vec![batch1], batches.take()); diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index 5fc886f8b9..6b24a9c5a9 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow_array::arrow_array_get; use datatypes::schema::SchemaRef; use datatypes::value::Value; use datatypes::vectors::{Helper, VectorRef}; @@ -22,32 +20,88 @@ use serde::{Serialize, Serializer}; use snafu::ResultExt; use crate::error::{self, Result}; +use crate::DfRecordBatch; -// TODO(yingwen): We should hold vectors in the RecordBatch. +/// A two-dimensional batch of column-oriented data with a defined schema. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, - pub df_recordbatch: DfRecordBatch, + columns: Vec, + df_record_batch: DfRecordBatch, } impl RecordBatch { + /// Create a new [`RecordBatch`] from `schema` and `columns`. 
pub fn new>( schema: SchemaRef, columns: I, ) -> Result { - let arrow_arrays = columns.into_iter().map(|v| v.to_arrow_array()).collect(); + let columns: Vec<_> = columns.into_iter().collect(); + let arrow_arrays = columns.iter().map(|v| v.to_arrow_array()).collect(); - let df_recordbatch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) + let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - df_recordbatch, + columns, + df_record_batch, }) } + /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`. + /// + /// This method doesn't check the schema. + pub fn try_from_df_record_batch( + schema: SchemaRef, + df_record_batch: DfRecordBatch, + ) -> Result { + let columns = df_record_batch + .columns() + .iter() + .map(|c| Helper::try_into_vector(c.clone()).context(error::DataTypesSnafu)) + .collect::>>()?; + + Ok(RecordBatch { + schema, + columns, + df_record_batch, + }) + } + + #[inline] + pub fn df_record_batch(&self) -> &DfRecordBatch { + &self.df_record_batch + } + + #[inline] + pub fn into_df_record_batch(self) -> DfRecordBatch { + self.df_record_batch + } + + #[inline] + pub fn columns(&self) -> &[VectorRef] { + &self.columns + } + + #[inline] + pub fn column(&self, idx: usize) -> &VectorRef { + &self.columns[idx] + } + + pub fn column_by_name(&self, name: &str) -> Option<&VectorRef> { + let idx = self.schema.column_index_by_name(name)?; + Some(&self.columns[idx]) + } + + #[inline] + pub fn num_columns(&self) -> usize { + self.columns.len() + } + + #[inline] pub fn num_rows(&self) -> usize { - self.df_recordbatch.num_rows() + self.df_record_batch.num_rows() } /// Create an iterator to traverse the data by row @@ -61,14 +115,15 @@ impl Serialize for RecordBatch { where S: Serializer, { + // TODO(yingwen): arrow and arrow2's schemas have different fields, so + // it might be better to use our `RawSchema` as serialized field. let mut s = serializer.serialize_struct("record", 2)?; - s.serialize_field("schema", &self.schema.arrow_schema())?; + s.serialize_field("schema", &**self.schema.arrow_schema())?; - let df_columns = self.df_recordbatch.columns(); - - let vec = df_columns + let vec = self + .columns .iter() - .map(|c| Helper::try_into_vector(c.clone())?.serialize_to_json()) + .map(|c| c.serialize_to_json()) .collect::, _>>() .map_err(S::Error::custom)?; @@ -88,8 +143,8 @@ impl<'a> RecordBatchRowIterator<'a> { fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator { RecordBatchRowIterator { record_batch, - rows: record_batch.df_recordbatch.num_rows(), - columns: record_batch.df_recordbatch.num_columns(), + rows: record_batch.df_record_batch.num_rows(), + columns: record_batch.df_record_batch.num_columns(), row_cursor: 0, } } @@ -104,15 +159,9 @@ impl<'a> Iterator for RecordBatchRowIterator<'a> { } else { let mut row = Vec::with_capacity(self.columns); - // TODO(yingwen): Get from the vector if RecordBatch also holds vectors. 
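+            // The batch now stores vectors directly, so each cell is read back as a
+            // `datatypes::value::Value` via `Vector::get`, replacing the old per-cell
+            // `arrow_array_get` conversion: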
for col in 0..self.columns { - let column_array = self.record_batch.df_recordbatch.column(col); - match arrow_array_get(column_array.as_ref(), self.row_cursor) - .context(error::DataTypesSnafu) - { - Ok(field) => row.push(field), - Err(e) => return Some(Err(e.into())), - } + let column = self.record_batch.column(col); + row.push(column.get(self.row_cursor)); } self.row_cursor += 1; @@ -125,63 +174,60 @@ impl<'a> Iterator for RecordBatchRowIterator<'a> { mod tests { use std::sync::Arc; - use datafusion_common::field_util::SchemaExt; - use datafusion_common::record_batch::RecordBatch as DfRecordBatch; - use datatypes::arrow::array::UInt32Array; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::prelude::*; + use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; - use datatypes::vectors::{StringVector, UInt32Vector, Vector}; + use datatypes::vectors::{StringVector, UInt32Vector}; use super::*; #[test] - fn test_new_record_batch() { + fn test_record_batch() { let arrow_schema = Arc::new(ArrowSchema::new(vec![ Field::new("c1", DataType::UInt32, false), Field::new("c2", DataType::UInt32, false), ])); let schema = Arc::new(Schema::try_from(arrow_schema).unwrap()); - let v = Arc::new(UInt32Vector::from_slice(&[1, 2, 3])); - let columns: Vec = vec![v.clone(), v.clone()]; + let c1 = Arc::new(UInt32Vector::from_slice(&[1, 2, 3])); + let c2 = Arc::new(UInt32Vector::from_slice(&[4, 5, 6])); + let columns: Vec = vec![c1, c2]; - let batch = RecordBatch::new(schema.clone(), columns).unwrap(); - let expect = v.to_arrow_array(); - for column in batch.df_recordbatch.columns() { - let array = column.as_any().downcast_ref::().unwrap(); - assert_eq!( - expect.as_any().downcast_ref::().unwrap(), - array - ); + let batch = RecordBatch::new(schema.clone(), columns.clone()).unwrap(); + assert_eq!(3, batch.num_rows()); + assert_eq!(&columns, batch.columns()); + for (i, expect) in columns.iter().enumerate().take(batch.num_columns()) { + let column = batch.column(i); + assert_eq!(expect, column); } assert_eq!(schema, batch.schema); + + assert_eq!(columns[0], *batch.column_by_name("c1").unwrap()); + assert_eq!(columns[1], *batch.column_by_name("c2").unwrap()); + assert!(batch.column_by_name("c3").is_none()); + + let converted = + RecordBatch::try_from_df_record_batch(schema, batch.df_record_batch().clone()).unwrap(); + assert_eq!(batch, converted); + assert_eq!(*batch.df_record_batch(), converted.into_df_record_batch()); } #[test] pub fn test_serialize_recordbatch() { - let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + let column_schemas = vec![ColumnSchema::new( "number", - DataType::UInt32, + ConcreteDataType::uint32_datatype(), false, - )])); - let schema = Arc::new(Schema::try_from(arrow_schema.clone()).unwrap()); + )]; + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let numbers: Vec = (0..10).collect(); - let df_batch = DfRecordBatch::try_new( - arrow_schema, - vec![Arc::new(UInt32Array::from_slice(&numbers))], - ) - .unwrap(); - - let batch = RecordBatch { - schema, - df_recordbatch: df_batch, - }; + let columns = vec![Arc::new(UInt32Vector::from_slice(&numbers)) as VectorRef]; + let batch = RecordBatch::new(schema, columns).unwrap(); let output = serde_json::to_string(&batch).unwrap(); assert_eq!( - r#"{"schema":{"fields":[{"name":"number","data_type":"UInt32","is_nullable":false,"metadata":{}}],"metadata":{}},"columns":[[0,1,2,3,4,5,6,7,8,9]]}"#, + 
r#"{"schema":{"fields":[{"name":"number","data_type":"UInt32","nullable":false,"dict_id":0,"dict_is_ordered":false}],"metadata":{"greptime:version":"0"}},"columns":[[0,1,2,3,4,5,6,7,8,9]]}"#, output ); } diff --git a/src/common/recordbatch/src/util.rs b/src/common/recordbatch/src/util.rs index efe34dbfed..4b2f1a67c8 100644 --- a/src/common/recordbatch/src/util.rs +++ b/src/common/recordbatch/src/util.rs @@ -15,23 +15,29 @@ use futures::TryStreamExt; use crate::error::Result; -use crate::{RecordBatch, SendableRecordBatchStream}; +use crate::{RecordBatch, RecordBatches, SendableRecordBatchStream}; +/// Collect all the items from the stream into a vector of [`RecordBatch`]. pub async fn collect(stream: SendableRecordBatchStream) -> Result> { stream.try_collect::>().await } +/// Collect all the items from the stream into [RecordBatches]. +pub async fn collect_batches(stream: SendableRecordBatchStream) -> Result { + let schema = stream.schema(); + let batches = stream.try_collect::>().await?; + RecordBatches::try_new(schema, batches) +} + #[cfg(test)] mod tests { use std::mem; use std::pin::Pin; use std::sync::Arc; - use datafusion_common::field_util::SchemaExt; - use datafusion_common::record_batch::RecordBatch as DfRecordBatch; - use datatypes::arrow::array::UInt32Array; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use datatypes::schema::{Schema, SchemaRef}; + use datatypes::prelude::*; + use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; + use datatypes::vectors::UInt32Vector; use futures::task::{Context, Poll}; use futures::Stream; @@ -65,12 +71,13 @@ mod tests { #[tokio::test] async fn test_collect() { - let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + let column_schemas = vec![ColumnSchema::new( "number", - DataType::UInt32, + ConcreteDataType::uint32_datatype(), false, - )])); - let schema = Arc::new(Schema::try_from(arrow_schema.clone()).unwrap()); + )]; + + let schema = Arc::new(Schema::try_new(column_schemas).unwrap()); let stream = MockRecordBatchStream { schema: schema.clone(), @@ -81,24 +88,23 @@ mod tests { assert_eq!(0, batches.len()); let numbers: Vec = (0..10).collect(); - let df_batch = DfRecordBatch::try_new( - arrow_schema.clone(), - vec![Arc::new(UInt32Array::from_slice(&numbers))], - ) - .unwrap(); - - let batch = RecordBatch { - schema: schema.clone(), - df_recordbatch: df_batch, - }; + let columns = [Arc::new(UInt32Vector::from_vec(numbers)) as _]; + let batch = RecordBatch::new(schema.clone(), columns).unwrap(); let stream = MockRecordBatchStream { - schema: Arc::new(Schema::try_from(arrow_schema).unwrap()), + schema: schema.clone(), batch: Some(batch.clone()), }; let batches = collect(Box::pin(stream)).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batch, batches[0]); + + let stream = MockRecordBatchStream { + schema: schema.clone(), + batch: Some(batch.clone()), + }; + let batches = collect_batches(Box::pin(stream)).await.unwrap(); + let expect_batches = RecordBatches::try_new(schema.clone(), vec![batch]).unwrap(); + assert_eq!(expect_batches, batches); } } diff --git a/src/common/substrait/Cargo.toml b/src/common/substrait/Cargo.toml index 9f9aea0b5e..815a986d1e 100644 --- a/src/common/substrait/Cargo.toml +++ b/src/common/substrait/Cargo.toml @@ -10,10 +10,8 @@ catalog = { path = "../../catalog" } common-catalog = { path = "../catalog" } common-error = { path = "../error" } common-telemetry = { path = "../telemetry" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = 
"arrow2", features = [ - "simd", -] } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-expr = "14.0.0" datatypes = { path = "../../datatypes" } futures = "0.3" prost = "0.9" diff --git a/src/common/substrait/src/context.rs b/src/common/substrait/src/context.rs index b017e9cc9a..af4a07b788 100644 --- a/src/common/substrait/src/context.rs +++ b/src/common/substrait/src/context.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; -use datafusion::logical_plan::DFSchemaRef; +use datafusion::common::DFSchemaRef; use substrait_proto::protobuf::extensions::simple_extension_declaration::{ ExtensionFunction, MappingType, }; diff --git a/src/common/substrait/src/df_expr.rs b/src/common/substrait/src/df_expr.rs index d924e7b085..b8d77a113c 100644 --- a/src/common/substrait/src/df_expr.rs +++ b/src/common/substrait/src/df_expr.rs @@ -15,8 +15,8 @@ use std::collections::VecDeque; use std::str::FromStr; -use datafusion::logical_plan::{Column, Expr}; -use datafusion_expr::{expr_fn, lit, BuiltinScalarFunction, Operator}; +use datafusion::common::Column; +use datafusion_expr::{expr_fn, lit, Between, BinaryExpr, BuiltinScalarFunction, Expr, Operator}; use datatypes::schema::Schema; use snafu::{ensure, OptionExt}; use substrait_proto::protobuf::expression::field_reference::ReferenceType as FieldReferenceType; @@ -311,21 +311,21 @@ pub fn convert_scalar_function( // skip GetIndexedField, unimplemented. "between" => { ensure_arg_len(3)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(inputs.pop_front().unwrap()), negated: false, low: Box::new(inputs.pop_front().unwrap()), high: Box::new(inputs.pop_front().unwrap()), - } + }) } "not_between" => { ensure_arg_len(3)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(inputs.pop_front().unwrap()), negated: true, low: Box::new(inputs.pop_front().unwrap()), high: Box::new(inputs.pop_front().unwrap()), - } + }) } // skip Case, is covered in substrait::SwitchExpression. // skip Cast and TryCast, is covered in substrait::Cast. @@ -477,7 +477,7 @@ pub fn expression_from_df_expr( rex_type: Some(RexType::Literal(l)), } } - Expr::BinaryExpr { left, op, right } => { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { let left = expression_from_df_expr(ctx, left, schema)?; let right = expression_from_df_expr(ctx, right, schema)?; let arguments = utils::expression_to_argument(vec![left, right]); @@ -518,12 +518,12 @@ pub fn expression_from_df_expr( name: expr.to_string(), } .fail()?, - Expr::Between { + Expr::Between(Between { expr, negated, low, high, - } => { + }) => { let expr = expression_from_df_expr(ctx, expr, schema)?; let low = expression_from_df_expr(ctx, low, schema)?; let high = expression_from_df_expr(ctx, high, schema)?; @@ -564,7 +564,21 @@ pub fn expression_from_df_expr( | Expr::WindowFunction { .. } | Expr::AggregateUDF { .. } | Expr::InList { .. } - | Expr::Wildcard => UnsupportedExprSnafu { + | Expr::Wildcard + | Expr::Like(_) + | Expr::ILike(_) + | Expr::SimilarTo(_) + | Expr::IsTrue(_) + | Expr::IsFalse(_) + | Expr::IsUnknown(_) + | Expr::IsNotTrue(_) + | Expr::IsNotFalse(_) + | Expr::IsNotUnknown(_) + | Expr::Exists { .. } + | Expr::InSubquery { .. } + | Expr::ScalarSubquery(..) + | Expr::QualifiedWildcard { .. 
} => todo!(), + Expr::GroupingSet(_) => UnsupportedExprSnafu { name: expr.to_string(), } .fail()?, @@ -628,6 +642,10 @@ mod utils { Operator::RegexNotIMatch => "regex_not_i_match", Operator::BitwiseAnd => "bitwise_and", Operator::BitwiseOr => "bitwise_or", + Operator::BitwiseXor => "bitwise_xor", + Operator::BitwiseShiftRight => "bitwise_shift_right", + Operator::BitwiseShiftLeft => "bitwise_shift_left", + Operator::StringConcat => "string_concat", } } @@ -679,7 +697,6 @@ mod utils { BuiltinScalarFunction::Sqrt => "sqrt", BuiltinScalarFunction::Tan => "tan", BuiltinScalarFunction::Trunc => "trunc", - BuiltinScalarFunction::Array => "make_array", BuiltinScalarFunction::Ascii => "ascii", BuiltinScalarFunction::BitLength => "bit_length", BuiltinScalarFunction::Btrim => "btrim", @@ -723,6 +740,17 @@ mod utils { BuiltinScalarFunction::Trim => "trim", BuiltinScalarFunction::Upper => "upper", BuiltinScalarFunction::RegexpMatch => "regexp_match", + BuiltinScalarFunction::Atan2 => "atan2", + BuiltinScalarFunction::Coalesce => "coalesce", + BuiltinScalarFunction::Power => "power", + BuiltinScalarFunction::MakeArray => "make_array", + BuiltinScalarFunction::DateBin => "date_bin", + BuiltinScalarFunction::FromUnixtime => "from_unixtime", + BuiltinScalarFunction::CurrentDate => "current_date", + BuiltinScalarFunction::CurrentTime => "current_time", + BuiltinScalarFunction::Uuid => "uuid", + BuiltinScalarFunction::Struct => "struct", + BuiltinScalarFunction::ArrowTypeof => "arrow_type_of", } } } diff --git a/src/common/substrait/src/df_logical.rs b/src/common/substrait/src/df_logical.rs index 81909cf38d..a6a81fb6f5 100644 --- a/src/common/substrait/src/df_logical.rs +++ b/src/common/substrait/src/df_logical.rs @@ -19,10 +19,10 @@ use catalog::CatalogManagerRef; use common_error::prelude::BoxedError; use common_telemetry::debug; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; -use datafusion::datasource::TableProvider; -use datafusion::logical_plan::plan::Filter; -use datafusion::logical_plan::{LogicalPlan, TableScan, ToDFSchema}; +use datafusion::common::ToDFSchema; +use datafusion::datasource::DefaultTableSource; use datafusion::physical_plan::project_schema; +use datafusion_expr::{Filter, LogicalPlan, TableScan, TableSource}; use prost::Message; use snafu::{ensure, OptionExt, ResultExt}; use substrait_proto::protobuf::expression::mask_expression::{StructItem, StructSelect}; @@ -144,7 +144,7 @@ impl DFLogicalSubstraitConvertor { .context(error::ConvertDfSchemaSnafu)?; let predicate = to_df_expr(ctx, *condition, &schema)?; - LogicalPlan::Filter(Filter { predicate, input }) + LogicalPlan::Filter(Filter::try_new(predicate, input).context(DFInternalSnafu)?) } RelType::Fetch(_fetch_rel) => UnsupportedPlanSnafu { name: "Fetch Relation", @@ -238,7 +238,9 @@ impl DFLogicalSubstraitConvertor { .context(TableNotFoundSnafu { name: format!("{}.{}.{}", catalog_name, schema_name, table_name), })?; - let adapter = Arc::new(DfTableProviderAdapter::new(table_ref)); + let adapter = Arc::new(DefaultTableSource::new(Arc::new( + DfTableProviderAdapter::new(table_ref), + ))); // Get schema directly from the table, and compare it with the schema retrieved from substrait proto. 
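// Aside: DataFusion 14 builds logical plans against `TableSource` rather than
// `TableProvider`, hence the `DefaultTableSource` wrapper above. A minimal sketch of the
// two-step unwrap this implies (hypothetical bindings; `table_provider` is the public
// field of `DefaultTableSource`):
//
//     let source = Arc::new(DefaultTableSource::new(Arc::new(
//         DfTableProviderAdapter::new(table_ref),
//     )));
//     // First downcast to `DefaultTableSource`, then through its `table_provider`
//     // field to the concrete adapter, as the plan-conversion code below does.
//     let adapter = source
//         .as_any()
//         .downcast_ref::<DefaultTableSource>()
//         .and_then(|s| s.table_provider.as_any().downcast_ref::<DfTableProviderAdapter>());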
let stored_schema = adapter.schema(); @@ -267,14 +269,14 @@ impl DFLogicalSubstraitConvertor { ctx.set_df_schema(projected_schema.clone()); - // TODO(ruihang): Support limit + // TODO(ruihang): Support limit(fetch) Ok(LogicalPlan::TableScan(TableScan { table_name: format!("{}.{}.{}", catalog_name, schema_name, table_name), source: adapter, projection, projected_schema, filters, - limit: None, + fetch: None, })) } @@ -302,7 +304,7 @@ impl DFLogicalSubstraitConvertor { .fail()?, LogicalPlan::Filter(filter) => { let input = Some(Box::new( - self.logical_plan_to_rel(ctx, filter.input.clone())?, + self.logical_plan_to_rel(ctx, filter.input().clone())?, )); let schema = plan @@ -312,7 +314,7 @@ impl DFLogicalSubstraitConvertor { .context(error::ConvertDfSchemaSnafu)?; let condition = Some(Box::new(expression_from_df_expr( ctx, - &filter.predicate, + filter.predicate(), &schema, )?)); @@ -368,7 +370,16 @@ impl DFLogicalSubstraitConvertor { name: "DataFusion Logical Limit", } .fail()?, - LogicalPlan::CreateExternalTable(_) + + LogicalPlan::Subquery(_) + | LogicalPlan::SubqueryAlias(_) + | LogicalPlan::CreateView(_) + | LogicalPlan::CreateCatalogSchema(_) + | LogicalPlan::CreateCatalog(_) + | LogicalPlan::DropView(_) + | LogicalPlan::Distinct(_) + | LogicalPlan::SetVariable(_) + | LogicalPlan::CreateExternalTable(_) | LogicalPlan::CreateMemoryTable(_) | LogicalPlan::DropTable(_) | LogicalPlan::Values(_) @@ -414,6 +425,10 @@ impl DFLogicalSubstraitConvertor { let provider = table_scan .source .as_any() + .downcast_ref::() + .context(UnknownPlanSnafu)? + .table_provider + .as_any() .downcast_ref::() .context(UnknownPlanSnafu)?; let table_info = provider.table().table_info(); @@ -485,7 +500,9 @@ impl DFLogicalSubstraitConvertor { fn same_schema_without_metadata(lhs: &ArrowSchemaRef, rhs: &ArrowSchemaRef) -> bool { lhs.fields.len() == rhs.fields.len() && lhs.fields.iter().zip(rhs.fields.iter()).all(|(x, y)| { - x.name == y.name && x.data_type == y.data_type && x.is_nullable == y.is_nullable + x.name() == y.name() + && x.data_type() == y.data_type() + && x.is_nullable() == y.is_nullable() }) } @@ -494,7 +511,7 @@ mod test { use catalog::local::{LocalCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; use catalog::{CatalogList, CatalogProvider, RegisterTableRequest}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; - use datafusion::logical_plan::DFSchema; + use datafusion::common::{DFSchema, ToDFSchema}; use datatypes::schema::Schema; use table::requests::CreateTableRequest; use table::test_util::{EmptyTable, MockTableEngine}; @@ -564,7 +581,9 @@ mod test { }) .await .unwrap(); - let adapter = Arc::new(DfTableProviderAdapter::new(table_ref)); + let adapter = Arc::new(DefaultTableSource::new(Arc::new( + DfTableProviderAdapter::new(table_ref), + ))); let projection = vec![1, 3, 5]; let df_schema = adapter.schema().to_dfschema().unwrap(); @@ -584,7 +603,7 @@ mod test { projection: Some(projection), projected_schema, filters: vec![], - limit: None, + fetch: None, }); logical_plan_round_trip(table_scan_plan, catalog_manager).await; diff --git a/src/common/time/src/date.rs b/src/common/time/src/date.rs index 30e4529063..b12eb9f50d 100644 --- a/src/common/time/src/date.rs +++ b/src/common/time/src/date.rs @@ -55,8 +55,11 @@ impl From for Date { impl Display for Date { /// [Date] is formatted according to ISO-8601 standard. 
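// chrono 0.4.23 deprecates the panicking date/time constructors in favor of fallible
// `_opt` variants, which is what the rewritten `fmt` below relies on. A small sketch of
// the pattern (719_163 is the day count from the common era to the Unix epoch, i.e. the
// crate's `UNIX_EPOCH_FROM_CE` constant):
//
//     use chrono::NaiveDate;
//     // Some(1970-01-01); `None` would be returned for an out-of-range day count.
//     let epoch = NaiveDate::from_num_days_from_ce_opt(719_163);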
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let abs_date = NaiveDate::from_num_days_from_ce(UNIX_EPOCH_FROM_CE + self.0); - f.write_str(&abs_date.format("%F").to_string()) + if let Some(abs_date) = NaiveDate::from_num_days_from_ce_opt(UNIX_EPOCH_FROM_CE + self.0) { + write!(f, "{}", abs_date.format("%F")) + } else { + write!(f, "Date({})", self.0) + } } } @@ -95,7 +98,7 @@ mod tests { Date::from_str("1969-01-01").unwrap().to_string() ); - let now = Utc::now().date().format("%F").to_string(); + let now = Utc::now().date_naive().format("%F").to_string(); assert_eq!(now, Date::from_str(&now).unwrap().to_string()); } diff --git a/src/common/time/src/datetime.rs b/src/common/time/src/datetime.rs index 4055a07429..73d465babe 100644 --- a/src/common/time/src/datetime.rs +++ b/src/common/time/src/datetime.rs @@ -31,8 +31,11 @@ pub struct DateTime(i64); impl Display for DateTime { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let abs_time = NaiveDateTime::from_timestamp(self.0, 0); - write!(f, "{}", abs_time.format(DATETIME_FORMAT)) + if let Some(abs_time) = NaiveDateTime::from_timestamp_opt(self.0, 0) { + write!(f, "{}", abs_time.format(DATETIME_FORMAT)) + } else { + write!(f, "DateTime({})", self.0) + } } } diff --git a/src/common/time/src/timestamp.rs b/src/common/time/src/timestamp.rs index 5ff20f702b..b3de23d01d 100644 --- a/src/common/time/src/timestamp.rs +++ b/src/common/time/src/timestamp.rs @@ -14,6 +14,7 @@ use core::default::Default; use std::cmp::Ordering; +use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::str::FromStr; @@ -34,13 +35,34 @@ impl Timestamp { Self { unit, value } } - pub fn from_millis(value: i64) -> Self { + pub fn new_second(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Second, + } + } + + pub fn new_millisecond(value: i64) -> Self { Self { value, unit: TimeUnit::Millisecond, } } + pub fn new_microsecond(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Microsecond, + } + } + + pub fn new_nanosecond(value: i64) -> Self { + Self { + value, + unit: TimeUnit::Nanosecond, + } + } + pub fn unit(&self) -> TimeUnit { self.unit } @@ -54,6 +76,8 @@ self.value * self.unit.factor() / unit.factor() } + /// Format timestamp to ISO8601 string. If the timestamp exceeds what chrono timestamp can + /// represent, this function simply prints the timestamp unit and value as a plain string.
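// For illustration, the renamed constructors pair with the formatter below like this
// (a sketch, not part of the patch; the expected output matches the updated tests):
//
//     let ts = Timestamp::new_millisecond(1668070237000);
//     assert_eq!("2022-11-10 08:50:37+0000", ts.to_iso8601_string());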
pub fn to_iso8601_string(&self) -> String { let nano_factor = TimeUnit::Second.factor() / TimeUnit::Nanosecond.factor(); @@ -65,8 +89,11 @@ impl Timestamp { nsecs += nano_factor; } - let datetime = Utc.timestamp(secs, nsecs as u32); - format!("{}", datetime.format("%Y-%m-%d %H:%M:%S%.f%z")) + if let LocalResult::Single(datetime) = Utc.timestamp_opt(secs, nsecs as u32) { + format!("{}", datetime.format("%Y-%m-%d %H:%M:%S%.f%z")) + } else { + format!("[Timestamp{}: {}]", self.unit, self.value) + } } } @@ -168,6 +195,25 @@ pub enum TimeUnit { Nanosecond, } +impl Display for TimeUnit { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + TimeUnit::Second => { + write!(f, "Second") + } + TimeUnit::Millisecond => { + write!(f, "Millisecond") + } + TimeUnit::Microsecond => { + write!(f, "Microsecond") + } + TimeUnit::Nanosecond => { + write!(f, "Nanosecond") + } + } + } +} + impl TimeUnit { pub fn factor(&self) -> i64 { match self { @@ -249,10 +295,11 @@ mod tests { // but expected timestamp is in UTC timezone fn check_from_str(s: &str, expect: &str) { let ts = Timestamp::from_str(s).unwrap(); - let time = NaiveDateTime::from_timestamp( + let time = NaiveDateTime::from_timestamp_opt( ts.value / 1_000_000_000, (ts.value % 1_000_000_000) as u32, - ); + ) + .unwrap(); assert_eq!(expect, time.to_string()); } @@ -265,7 +312,13 @@ mod tests { check_from_str( "2020-09-08 13:42:29", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 0, ) .unwrap() @@ -275,7 +328,13 @@ mod tests { check_from_str( "2020-09-08T13:42:29", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 0, ) .unwrap() @@ -285,7 +344,13 @@ mod tests { check_from_str( "2020-09-08 13:42:29.042", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 42000000, ) .unwrap() @@ -296,7 +361,13 @@ mod tests { check_from_str( "2020-09-08T13:42:29.042", &NaiveDateTime::from_timestamp_opt( - 1599572549 - Local.timestamp(0, 0).offset().fix().local_minus_utc() as i64, + 1599572549 + - Local + .timestamp_opt(0, 0) + .unwrap() + .offset() + .fix() + .local_minus_utc() as i64, 42000000, ) .unwrap() @@ -316,19 +387,19 @@ mod tests { assert_eq!(datetime_str, ts.to_iso8601_string()); let ts_millis = 1668070237000; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("2022-11-10 08:50:37+0000", ts.to_iso8601_string()); let ts_millis = -1000; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:59+0000", ts.to_iso8601_string()); let ts_millis = -1; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:59.999+0000", ts.to_iso8601_string()); let ts_millis = -1001; - let ts = Timestamp::from_millis(ts_millis); + let ts = Timestamp::new_millisecond(ts_millis); assert_eq!("1969-12-31 23:59:58.999+0000", ts.to_iso8601_string()); } diff --git a/src/common/time/src/util.rs b/src/common/time/src/util.rs index 3d3baebc2e..1917ce3456 100644 --- 
a/src/common/time/src/util.rs +++ b/src/common/time/src/util.rs @@ -33,8 +33,8 @@ mod tests { .duration_since(time::UNIX_EPOCH) .unwrap() .as_millis() as i64; - let datetime_now = chrono::Utc.timestamp_millis(now); - let datetime_std = chrono::Utc.timestamp_millis(millis_from_std); + let datetime_now = chrono::Utc.timestamp_millis_opt(now).unwrap(); + let datetime_std = chrono::Utc.timestamp_millis_opt(millis_from_std).unwrap(); assert_eq!(datetime_std.year(), datetime_now.year()); assert_eq!(datetime_std.month(), datetime_now.month()); diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 47f34d2186..a245340ad8 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -25,9 +25,7 @@ common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } +datafusion = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" hyper = { version = "0.14", features = ["full"] } @@ -59,8 +57,5 @@ tower-http = { version = "0.3", features = ["full"] } axum-test-helper = { git = "https://github.com/sunng87/axum-test-helper.git", branch = "patch-1" } client = { path = "../client" } common-query = { path = "../common/query" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0.0" tempdir = "0.3" diff --git a/src/datanode/src/server/grpc.rs b/src/datanode/src/server/grpc.rs index 26108eb020..5109522541 100644 --- a/src/datanode/src/server/grpc.rs +++ b/src/datanode/src/server/grpc.rs @@ -260,7 +260,7 @@ mod tests { }, ColumnDef { name: "ts".to_string(), - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, is_nullable: false, default_constraint: None, }, @@ -295,8 +295,12 @@ mod tests { fn expected_table_schema() -> SchemaRef { let column_schemas = vec![ ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), ]; diff --git a/src/datanode/src/sql.rs b/src/datanode/src/sql.rs index 0a3b4a999e..e578bec1e9 100644 --- a/src/datanode/src/sql.rs +++ b/src/datanode/src/sql.rs @@ -154,8 +154,12 @@ mod tests { ColumnSchema::new("host", ConcreteDataType::string_datatype(), false), ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true), ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), true) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + true, + ) + .with_time_index(true), ]; Arc::new( @@ -284,11 +288,11 @@ mod tests { let ts = &columns_values["ts"]; assert_eq!(2, ts.len()); assert_eq!( - Value::from(Timestamp::from_millis(1655276557000i64)), + Value::from(Timestamp::new_millisecond(1655276557000i64)), ts.get(0) ); assert_eq!( - 
Value::from(Timestamp::from_millis(1655276558000i64)), + Value::from(Timestamp::new_millisecond(1655276558000i64)), ts.get(1) ); } diff --git a/src/datanode/src/sql/create.rs b/src/datanode/src/sql/create.rs index 8b75bdef3f..ac80338aa8 100644 --- a/src/datanode/src/sql/create.rs +++ b/src/datanode/src/sql/create.rs @@ -375,7 +375,7 @@ mod tests { .data_type ); assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), request .schema .column_schema_by_name("ts") diff --git a/src/datanode/src/sql/insert.rs b/src/datanode/src/sql/insert.rs index 8c2dae5c4a..6c99b71729 100644 --- a/src/datanode/src/sql/insert.rs +++ b/src/datanode/src/sql/insert.rs @@ -14,7 +14,9 @@ use catalog::CatalogManagerRef; use common_query::Output; -use datatypes::prelude::{ConcreteDataType, VectorBuilder}; +use datatypes::data_type::DataType; +use datatypes::prelude::ConcreteDataType; +use datatypes::vectors::MutableVector; use snafu::{ensure, OptionExt, ResultExt}; use sql::ast::Value as SqlValue; use sql::statements::insert::Insert; @@ -70,7 +72,7 @@ impl SqlHandler { }; let rows_num = values.len(); - let mut columns_builders: Vec<(&String, &ConcreteDataType, VectorBuilder)> = + let mut columns_builders: Vec<(&String, &ConcreteDataType, Box<dyn MutableVector>)> = Vec::with_capacity(columns_num); if columns.is_empty() { @@ -79,7 +81,7 @@ columns_builders.push(( &column_schema.name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } else { @@ -95,7 +97,7 @@ columns_builders.push(( column_name, data_type, - VectorBuilder::with_capacity(data_type.clone(), rows_num), + data_type.create_mutable_vector(rows_num), )); } } @@ -123,7 +125,7 @@ table_name: table_ref.table.to_string(), columns_values: columns_builders .into_iter() - .map(|(c, _, mut b)| (c.to_owned(), b.finish())) + .map(|(c, _, mut b)| (c.to_owned(), b.to_vector())) .collect(), })) } @@ -133,11 +135,11 @@ fn add_row_to_vector( column_name: &str, data_type: &ConcreteDataType, sql_val: &SqlValue, - builder: &mut VectorBuilder, + builder: &mut Box<dyn MutableVector>, ) -> Result<()> { let value = statements::sql_value_to_value(column_name, data_type, sql_val) .context(ParseSqlValueSnafu)?; - builder.push(&value); + builder.push_value_ref(value.as_value_ref()).unwrap(); Ok(()) } diff --git a/src/datanode/src/tests/instance_test.rs b/src/datanode/src/tests/instance_test.rs index 1b01d05eae..26ba03da73 100644 --- a/src/datanode/src/tests/instance_test.rs +++ b/src/datanode/src/tests/instance_test.rs @@ -17,11 +17,8 @@ use std::sync::Arc; use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_query::Output; use common_recordbatch::util; -use datafusion::arrow_print; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow::array::{Int64Array, UInt64Array, Utf8Array}; -use datatypes::arrow_array::StringArray; -use datatypes::prelude::ConcreteDataType; +use datatypes::data_type::ConcreteDataType; +use datatypes::vectors::{Int64Vector, StringVector, UInt64Vector, VectorRef}; use session::context::QueryContext; use crate::instance::Instance; @@ -66,11 +63,13 @@ async fn test_create_database_and_insert_query() { match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - &Int64Array::from_slice(&[1655276557000,
1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -155,18 +154,15 @@ async fn assert_query_result(instance: &Instance, sql: &str, ts: i64, host: &str match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(2, columns.len()); + assert_eq!(2, batches[0].num_columns()); assert_eq!( - &Utf8Array::<i32>::from_slice(&[host]), - columns[0] - .as_any() - .downcast_ref::<Utf8Array<i32>>() - .unwrap() + Arc::new(StringVector::from(vec![host])) as VectorRef, + *batches[0].column(0) ); assert_eq!( - &Int64Array::from_slice(&[ts]), - columns[1].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ts])) as VectorRef, + *batches[0].column(1) ); } _ => unreachable!(), @@ -183,7 +179,7 @@ async fn setup_test_instance(test_name: &str) -> Instance { test_util::create_test_table( instance.catalog_manager(), instance.sql_handler(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .await .unwrap(); @@ -235,11 +231,13 @@ async fn test_execute_insert_query_with_i64_timestamp() { match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - &Int64Array::from_slice(&[1655276557000, 1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -249,11 +247,13 @@ match query_output { Output::Stream(s) => { let batches = util::collect(s).await.unwrap(); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); + assert_eq!(1, batches[0].num_columns()); assert_eq!( - &Int64Array::from_slice(&[1655276557000, 1655276558000]), - columns[0].as_any().downcast_ref::<Int64Array>().unwrap() + Arc::new(Int64Vector::from_vec(vec![ + 1655276557000_i64, + 1655276558000_i64 + ])) as VectorRef, + *batches[0].column(0) ); } _ => unreachable!(), @@ -270,13 +270,12 @@ async fn test_execute_query() { match output { Output::Stream(recordbatch) => { let numbers = util::collect(recordbatch).await.unwrap(); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, numbers[0].num_columns()); + assert_eq!(numbers[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::<UInt64Array>().unwrap(), - UInt64Array::from_slice(&[4950]) + Arc::new(UInt64Vector::from_vec(vec![4950_u64])) as VectorRef, + *numbers[0].column(0), ); } _ => unreachable!(), @@ -294,13 +293,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::<StringArray>().unwrap(), - StringArray::from(vec![Some("public")]) + *databases[0].column(0), + Arc::new(StringVector::from(vec![Some("public")])) as
VectorRef ); } _ => unreachable!(), @@ -310,13 +308,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - StringArray::from(vec![Some("public")]) + *databases[0].column(0), + Arc::new(StringVector::from(vec![Some("public")])) as VectorRef ); } _ => unreachable!(), @@ -326,9 +323,8 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 2); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 2); } _ => unreachable!(), } @@ -337,7 +333,7 @@ async fn test_execute_show_databases_tables() { test_util::create_test_table( instance.catalog_manager(), instance.sql_handler(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .await .unwrap(); @@ -346,9 +342,8 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 3); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 3); } _ => unreachable!(), } @@ -358,13 +353,12 @@ async fn test_execute_show_databases_tables() { match output { Output::RecordBatches(databases) => { let databases = databases.take(); - let columns = databases[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + assert_eq!(1, databases[0].num_columns()); + assert_eq!(databases[0].column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - StringArray::from(vec![Some("demo")]) + *databases[0].column(0), + Arc::new(StringVector::from(vec![Some("demo")])) as VectorRef ); } _ => unreachable!(), @@ -394,18 +388,13 @@ pub async fn test_execute_create() { assert!(matches!(output, Output::AffectedRows(1))); } -async fn check_output_stream(output: Output, expected: Vec<&str>) { +async fn check_output_stream(output: Output, expected: String) { let recordbatches = match output { - Output::Stream(stream) => util::collect(stream).await.unwrap(), - Output::RecordBatches(recordbatches) => recordbatches.take(), + Output::Stream(stream) => util::collect_batches(stream).await.unwrap(), + Output::RecordBatches(recordbatches) => recordbatches, _ => unreachable!(), }; - let recordbatches = recordbatches - .into_iter() - .map(|r| r.df_recordbatch) - .collect::>(); - let pretty_print = arrow_print::write(&recordbatches); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = recordbatches.pretty_print().unwrap(); assert_eq!(pretty_print, expected); } @@ -438,15 +427,16 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+--------+---------------------+--------+", - "| host | cpu | memory | ts | my_tag |", - "+-------+-----+--------+---------------------+--------+", - "| host1 | 1.1 | 100 | 
1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 200 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 300 | 1970-01-01 00:00:03 | |", - "+-------+-----+--------+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+--------+---------------------+--------+ +| host | cpu | memory | ts | my_tag | ++-------+-----+--------+---------------------+--------+ +| host1 | 1.1 | 100 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 200 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 300 | 1970-01-01T00:00:03 | | ++-------+-----+--------+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; // Drop a column @@ -454,15 +444,16 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(0))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+---------------------+--------+", - "| host | cpu | ts | my_tag |", - "+-------+-----+---------------------+--------+", - "| host1 | 1.1 | 1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 1970-01-01 00:00:03 | |", - "+-------+-----+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+---------------------+--------+ +| host | cpu | ts | my_tag | ++-------+-----+---------------------+--------+ +| host1 | 1.1 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 1970-01-01T00:00:03 | | ++-------+-----+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; // insert a new row @@ -474,16 +465,17 @@ async fn test_alter_table() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select * from demo order by ts").await; - let expected = vec![ - "+-------+-----+---------------------+--------+", - "| host | cpu | ts | my_tag |", - "+-------+-----+---------------------+--------+", - "| host1 | 1.1 | 1970-01-01 00:00:01 | |", - "| host2 | 2.2 | 1970-01-01 00:00:02 | hello |", - "| host3 | 3.3 | 1970-01-01 00:00:03 | |", - "| host4 | 400 | 1970-01-01 00:00:04 | world |", - "+-------+-----+---------------------+--------+", - ]; + let expected = "\ ++-------+-----+---------------------+--------+ +| host | cpu | ts | my_tag | ++-------+-----+---------------------+--------+ +| host1 | 1.1 | 1970-01-01T00:00:01 | | +| host2 | 2.2 | 1970-01-01T00:00:02 | hello | +| host3 | 3.3 | 1970-01-01T00:00:03 | | +| host4 | 400 | 1970-01-01T00:00:04 | world | ++-------+-----+---------------------+--------+\ + " + .to_string(); check_output_stream(output, expected).await; } @@ -522,14 +514,15 @@ async fn test_insert_with_default_value_for_type(type_name: &str) { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql(&instance, "select host, cpu from test_table").await; - let expected = vec![ - "+-------+-----+", - "| host | cpu |", - "+-------+-----+", - "| host1 | 1.1 |", - "| host2 | 2.2 |", - "+-------+-----+", - ]; + let expected = "\ ++-------+-----+ +| host | cpu | ++-------+-----+ +| host1 | 1.1 | +| host2 | 2.2 | ++-------+-----+\ + " + .to_string(); check_output_stream(output, expected).await; } @@ -559,13 +552,14 @@ async fn test_use_database() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql_in_db(&instance, "show tables", "db1").await; - let expected = vec![ - "+--------+", - "| Tables |", - "+--------+", - "| tb1 |", - "+--------+", - ]; + let expected = "\ ++--------+ +| Tables | ++--------+ +| tb1 | 
++--------+\ + " + .to_string(); check_output_stream(output, expected).await; let output = execute_sql_in_db( @@ -577,25 +571,27 @@ async fn test_use_database() { assert!(matches!(output, Output::AffectedRows(1))); let output = execute_sql_in_db(&instance, "select col_i32 from tb1", "db1").await; - let expected = vec![ - "+---------+", - "| col_i32 |", - "+---------+", - "| 1 |", - "+---------+", - ]; + let expected = "\ ++---------+ +| col_i32 | ++---------+ +| 1 | ++---------+\ + " + .to_string(); check_output_stream(output, expected).await; // Making a particular database the default by means of the USE statement does not preclude // accessing tables in other databases. let output = execute_sql(&instance, "select number from public.numbers limit 1").await; - let expected = vec![ - "+--------+", - "| number |", - "+--------+", - "| 0 |", - "+--------+", - ]; + let expected = "\ ++--------+ +| number | ++--------+ +| 0 | ++--------+\ + " + .to_string(); check_output_stream(output, expected).await; } diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index 5c66508dd1..2841decb67 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -9,10 +9,12 @@ default = [] test = [] [dependencies] +arrow = { version = "26.0" } +arrow-schema = { version = "26.0", features = ["serde"] } common-base = { path = "../common/base" } common-error = { path = "../common/error" } common-time = { path = "../common/time" } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion-common = "14.0" enum_dispatch = "0.3" num = "0.4" num-traits = "0.2" @@ -21,17 +23,3 @@ paste = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" snafu = { version = "0.7", features = ["backtraces"] } - -[dependencies.arrow] -package = "arrow2" -version = "0.10" -features = [ - "io_csv", - "io_json", - "io_parquet", - "io_parquet_compression", - "io_ipc", - "ahash", - "compute", - "serde_types", -] diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index ca2cb6cc48..72de422142 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -12,216 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use arrow::array::{ - self, Array, BinaryArray as ArrowBinaryArray, ListArray, - MutableBinaryArray as ArrowMutableBinaryArray, MutableUtf8Array, PrimitiveArray, Utf8Array, -}; -use arrow::datatypes::DataType as ArrowDataType; -use common_time::timestamp::Timestamp; -use snafu::OptionExt; - -use crate::error::{ConversionSnafu, Result}; -use crate::prelude::ConcreteDataType; -use crate::value::{ListValue, Value}; - -pub type BinaryArray = ArrowBinaryArray; -pub type MutableBinaryArray = ArrowMutableBinaryArray; -pub type MutableStringArray = MutableUtf8Array; -pub type StringArray = Utf8Array; - -macro_rules! cast_array { - ($arr: ident, $CastType: ty) => { - $arr.as_any() - .downcast_ref::<$CastType>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", $arr.data_type()), - })? 
- }; -} - -pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { - if array.is_null(idx) { - return Ok(Value::Null); - } - - let result = match array.data_type() { - ArrowDataType::Null => Value::Null, - ArrowDataType::Boolean => { - Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)) - } - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Value::Binary(cast_array!(array, BinaryArray).value(idx).into()) - } - ArrowDataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::).value(idx)), - ArrowDataType::UInt16 => { - Value::UInt16(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::UInt32 => { - Value::UInt32(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::UInt64 => { - Value::UInt64(cast_array!(array, PrimitiveArray::).value(idx)) - } - ArrowDataType::Float32 => { - Value::Float32(cast_array!(array, PrimitiveArray::).value(idx).into()) - } - ArrowDataType::Float64 => { - Value::Float64(cast_array!(array, PrimitiveArray::).value(idx).into()) - } - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Value::String(cast_array!(array, StringArray).value(idx).into()) - } - ArrowDataType::Timestamp(t, _) => { - let value = cast_array!(array, PrimitiveArray::).value(idx); - let unit = match ConcreteDataType::from_arrow_time_unit(t) { - ConcreteDataType::Timestamp(t) => t.unit, - _ => unreachable!(), - }; - Value::Timestamp(Timestamp::new(value, unit)) - } - ArrowDataType::List(_) => { - let array = cast_array!(array, ListArray::).value(idx); - let inner_datatype = ConcreteDataType::try_from(array.data_type())?; - let values = (0..array.len()) - .map(|i| arrow_array_get(&*array, i)) - .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), inner_datatype)) - } - _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), - }; - - Ok(result) -} - -#[cfg(test)] -mod test { - use arrow::array::{ - BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::buffer::Buffer; - use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit}; - use common_time::timestamp::{TimeUnit, Timestamp}; - - use super::*; - use crate::prelude::Vector; - use crate::vectors::TimestampVector; - - #[test] - fn test_arrow_array_access() { - let array1 = BooleanArray::from_slice(vec![true, true, false, false]); - assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = 
UInt32Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]); - assert_eq!( - Value::Float32(2f32.into()), - arrow_array_get(&array1, 1).unwrap() - ); - let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]); - assert_eq!( - Value::Float64(2f64.into()), - arrow_array_get(&array1, 1).unwrap() - ); - - let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); - assert_eq!( - Value::String("hello".into()), - arrow_array_get(&array2, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - - let array3 = super::BinaryArray::from(vec![ - Some("hello".as_bytes()), - None, - Some("world".as_bytes()), - ]); - assert_eq!( - Value::Binary("hello".as_bytes().into()), - arrow_array_get(&array3, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - - let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4])); - let array = vector.to_boxed_arrow_array(); - let value = arrow_array_get(&*array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) - ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Millisecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), - arrow_array_get(&array4, 0).unwrap() - ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), - arrow_array_get(&array4, 0).unwrap() - ); - - // test list array - let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ListArray = arrow_array.into(); - - let v0 = arrow_array_get(&arrow_array, 0).unwrap(); - match v0 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!( - **items, - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] - ); - } - _ => unreachable!(), - } - - assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); - let v2 = arrow_array_get(&arrow_array, 2).unwrap(); - match v2 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); - } - _ => unreachable!(), - } - } -} +pub type BinaryArray = arrow::array::LargeBinaryArray; +pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; +pub type StringArray = arrow::array::StringArray; +pub type MutableStringArray = arrow::array::StringBuilder; diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs index e14a3d8e84..9e4641defa 100644 --- a/src/datatypes/src/data_type.rs +++ b/src/datatypes/src/data_type.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::datatypes::{DataType as 
ArrowDataType, TimeUnit as ArrowTimeUnit}; use common_time::timestamp::TimeUnit; use paste::paste; use serde::{Deserialize, Serialize}; @@ -23,13 +23,14 @@ use crate::error::{self, Error, Result}; use crate::type_id::LogicalTypeId; use crate::types::{ BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampType, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use crate::value::Value; use crate::vectors::MutableVector; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[enum_dispatch::enum_dispatch(DataType)] pub enum ConcreteDataType { Null(NullType), @@ -47,17 +48,21 @@ pub enum ConcreteDataType { Float32(Float32Type), Float64(Float64Type), - // String types + // String types: Binary(BinaryType), String(StringType), + // Date types: Date(DateType), DateTime(DateTimeType), Timestamp(TimestampType), + // Compound types: List(ListType), } +// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method +// returning all these properties to the `DataType` trait impl ConcreteDataType { pub fn is_float(&self) -> bool { matches!( @@ -70,7 +75,7 @@ impl ConcreteDataType { matches!(self, ConcreteDataType::Boolean(_)) } - pub fn stringifiable(&self) -> bool { + pub fn is_stringifiable(&self) -> bool { matches!( self, ConcreteDataType::String(_) @@ -103,13 +108,6 @@ impl ConcreteDataType { ) } - pub fn is_timestamp(&self) -> bool { - matches!( - self, - ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_) - ) - } - pub fn numerics() -> Vec { vec![ ConcreteDataType::int8_datatype(), @@ -136,6 +134,14 @@ impl ConcreteDataType { pub fn is_null(&self) -> bool { matches!(self, ConcreteDataType::Null(NullType)) } + + /// Try to cast the type as a [`ListType`]. + pub fn as_list(&self) -> Option<&ListType> { + match self { + ConcreteDataType::List(t) => Some(t), + _ => None, + } + } } impl TryFrom<&ArrowDataType> for ConcreteDataType { @@ -161,7 +167,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType { ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(&field.data_type), + ConcreteDataType::from_arrow_type(field.data_type()), )), _ => { return error::UnsupportedArrowTypeSnafu { @@ -191,38 +197,52 @@ macro_rules! 
impl_new_concrete_type_functions { impl_new_concrete_type_functions!( Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, String, Date, DateTime + Binary, Date, DateTime, String ); impl ConcreteDataType { - pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(inner_type)) + pub fn timestamp_second_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) + } + + pub fn timestamp_millisecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Millisecond( + TimestampMillisecondType::default(), + )) + } + + pub fn timestamp_microsecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Microsecond( + TimestampMicrosecondType::default(), + )) + } + + pub fn timestamp_nanosecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) } pub fn timestamp_datatype(unit: TimeUnit) -> Self { - ConcreteDataType::Timestamp(TimestampType::new(unit)) - } - - pub fn timestamp_millis_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond)) + match unit { + TimeUnit::Second => Self::timestamp_second_datatype(), + TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + } } /// Converts from arrow timestamp unit to the corresponding timestamp [ConcreteDataType]. - // TODO(hl): maybe impl From<ArrowTimeUnit> for our timestamp ? - pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self { + pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { match t { - arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second), - arrow::datatypes::TimeUnit::Millisecond => { - Self::timestamp_datatype(TimeUnit::Millisecond) - } - arrow::datatypes::TimeUnit::Microsecond => { - Self::timestamp_datatype(TimeUnit::Microsecond) - } - arrow::datatypes::TimeUnit::Nanosecond => { - Self::timestamp_datatype(TimeUnit::Nanosecond) - } + ArrowTimeUnit::Second => Self::timestamp_second_datatype(), + ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), } } + + pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(item_type)) + } } /// Data type abstraction. @@ -237,11 +257,15 @@ pub trait DataType: std::fmt::Debug + Send + Sync { /// Returns the default value of this type. fn default_value(&self) -> Value; - /// Convert this type as [arrow2::datatypes::DataType]. + /// Converts this type to [arrow::datatypes::DataType]. fn as_arrow_type(&self) -> ArrowDataType; - /// Create a mutable vector with given `capacity` of this type. + /// Creates a mutable vector with given `capacity` of this type. fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector>; + + /// Returns true if the data type is compatible with the timestamp type so we can + /// use it as a timestamp.
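// A usage sketch for the `create_mutable_vector` method declared above, mirroring the
// `insert.rs` changes earlier in this patch (hypothetical values; `push_value_ref` and
// `to_vector` are the `MutableVector` methods used there):
//
//     let mut builder = ConcreteDataType::int64_datatype().create_mutable_vector(2);
//     builder.push_value_ref(Value::Int64(42).as_value_ref()).unwrap();
//     let vector = builder.to_vector(); // freeze into an immutable VectorRef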
+ fn is_timestamp_compatible(&self) -> bool; } pub type DataTypeRef = Arc; @@ -324,10 +348,6 @@ mod tests { ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), ConcreteDataType::String(_) )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), - ConcreteDataType::String(_) - )); assert_eq!( ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( "item", @@ -345,31 +365,48 @@ mod tests { #[test] fn test_from_arrow_timestamp() { assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond) + ConcreteDataType::timestamp_millisecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond) + ConcreteDataType::timestamp_microsecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond) + ConcreteDataType::timestamp_nanosecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Second), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second) + ConcreteDataType::timestamp_second_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) ); } #[test] - fn test_is_timestamp() { - assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp()); - assert!(ConcreteDataType::int64_datatype().is_timestamp()); + fn test_is_timestamp_compatible() { + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() + ); + assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); + 
assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); } #[test] @@ -377,4 +414,91 @@ mod tests { assert!(ConcreteDataType::null_datatype().is_null()); assert!(!ConcreteDataType::int32_datatype().is_null()); } + + #[test] + fn test_is_float() { + assert!(!ConcreteDataType::int32_datatype().is_float()); + assert!(ConcreteDataType::float32_datatype().is_float()); + assert!(ConcreteDataType::float64_datatype().is_float()); + } + + #[test] + fn test_is_boolean() { + assert!(!ConcreteDataType::int32_datatype().is_boolean()); + assert!(!ConcreteDataType::float32_datatype().is_boolean()); + assert!(ConcreteDataType::boolean_datatype().is_boolean()); + } + + #[test] + fn test_is_stringifiable() { + assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); + assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); + assert!(ConcreteDataType::string_datatype().is_stringifiable()); + assert!(ConcreteDataType::date_datatype().is_stringifiable()); + assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); + } + + #[test] + fn test_is_signed() { + assert!(ConcreteDataType::int8_datatype().is_signed()); + assert!(ConcreteDataType::int16_datatype().is_signed()); + assert!(ConcreteDataType::int32_datatype().is_signed()); + assert!(ConcreteDataType::int64_datatype().is_signed()); + assert!(ConcreteDataType::date_datatype().is_signed()); + assert!(ConcreteDataType::datetime_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); + + assert!(!ConcreteDataType::uint8_datatype().is_signed()); + assert!(!ConcreteDataType::uint16_datatype().is_signed()); + assert!(!ConcreteDataType::uint32_datatype().is_signed()); + assert!(!ConcreteDataType::uint64_datatype().is_signed()); + + assert!(!ConcreteDataType::float32_datatype().is_signed()); + assert!(!ConcreteDataType::float64_datatype().is_signed()); + } + + #[test] + fn test_is_unsigned() { + assert!(!ConcreteDataType::int8_datatype().is_unsigned()); + assert!(!ConcreteDataType::int16_datatype().is_unsigned()); + assert!(!ConcreteDataType::int32_datatype().is_unsigned()); + assert!(!ConcreteDataType::int64_datatype().is_unsigned()); + assert!(!ConcreteDataType::date_datatype().is_unsigned()); + assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); + + assert!(ConcreteDataType::uint8_datatype().is_unsigned()); + assert!(ConcreteDataType::uint16_datatype().is_unsigned()); + assert!(ConcreteDataType::uint32_datatype().is_unsigned()); + assert!(ConcreteDataType::uint64_datatype().is_unsigned()); + + assert!(!ConcreteDataType::float32_datatype().is_unsigned()); + assert!(!ConcreteDataType::float64_datatype().is_unsigned()); + } + + #[test] + 
fn test_numerics() { + let nums = ConcreteDataType::numerics(); + assert_eq!(10, nums.len()); + } + + #[test] + fn test_as_list() { + let list_type = ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()); + assert_eq!( + ListType::new(ConcreteDataType::int32_datatype()), + *list_type.as_list().unwrap() + ); + assert!(ConcreteDataType::int32_datatype().as_list().is_none()); + } } diff --git a/src/datatypes/src/error.rs b/src/datatypes/src/error.rs index 50b49cf2b4..2cb8553a90 100644 --- a/src/datatypes/src/error.rs +++ b/src/datatypes/src/error.rs @@ -99,6 +99,12 @@ pub enum Error { #[snafu(display("Duplicated metadata for {}", key))] DuplicateMeta { key: String, backtrace: Backtrace }, + + #[snafu(display("Failed to convert value into scalar value, reason: {}", reason))] + ToScalarValue { + reason: String, + backtrace: Backtrace, + }, } impl ErrorExt for Error { diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index f6f6db112a..3051c7a4b3 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -20,9 +20,10 @@ pub mod data_type; pub mod error; pub mod macros; pub mod prelude; -mod scalars; +pub mod scalars; pub mod schema; pub mod serialize; +pub mod timestamp; pub mod type_id; pub mod types; pub mod value; diff --git a/src/datatypes/src/macros.rs b/src/datatypes/src/macros.rs index 18be9fa375..37c0a42e3f 100644 --- a/src/datatypes/src/macros.rs +++ b/src/datatypes/src/macros.rs @@ -12,27 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -///! Some helper macros for datatypes, copied from databend. -#[macro_export] -macro_rules! for_all_scalar_types { - ($macro:tt $(, $x:tt)*) => { - $macro! { - [$($x),*], - { i8 }, - { i16 }, - { i32 }, - { i64 }, - { u8 }, - { u16 }, - { u32 }, - { u64 }, - { f32 }, - { f64 }, - { bool }, - } - }; -} +//! Some helper macros for datatypes, copied from databend. +/// Apply the macro rules to all primitive types. #[macro_export] macro_rules! for_all_primitive_types { ($macro:tt $(, $x:tt)*) => { @@ -52,6 +34,8 @@ macro_rules! for_all_primitive_types { }; } +/// Match the logical type and apply `$body` to all primitive types and +/// `nbody` to other types. #[macro_export] macro_rules! with_match_primitive_type_id { ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ @@ -62,17 +46,21 @@ macro_rules! with_match_primitive_type_id { } use $crate::type_id::LogicalTypeId; + use $crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, + }; match $key_type { - LogicalTypeId::Int8 => __with_ty__! { i8 }, - LogicalTypeId::Int16 => __with_ty__! { i16 }, - LogicalTypeId::Int32 => __with_ty__! { i32 }, - LogicalTypeId::Int64 => __with_ty__! { i64 }, - LogicalTypeId::UInt8 => __with_ty__! { u8 }, - LogicalTypeId::UInt16 => __with_ty__! { u16 }, - LogicalTypeId::UInt32 => __with_ty__! { u32 }, - LogicalTypeId::UInt64 => __with_ty__! { u64 }, - LogicalTypeId::Float32 => __with_ty__! { f32 }, - LogicalTypeId::Float64 => __with_ty__! { f64 }, + LogicalTypeId::Int8 => __with_ty__! { Int8Type }, + LogicalTypeId::Int16 => __with_ty__! { Int16Type }, + LogicalTypeId::Int32 => __with_ty__! { Int32Type }, + LogicalTypeId::Int64 => __with_ty__! { Int64Type }, + LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, + LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, + LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, + LogicalTypeId::UInt64 => __with_ty__! 
{ UInt64Type }, + LogicalTypeId::Float32 => __with_ty__! { Float32Type }, + LogicalTypeId::Float64 => __with_ty__! { Float64Type }, _ => $nbody, } diff --git a/src/datatypes/src/prelude.rs b/src/datatypes/src/prelude.rs index 014a40efaf..b1afe93042 100644 --- a/src/datatypes/src/prelude.rs +++ b/src/datatypes/src/prelude.rs @@ -16,8 +16,6 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; pub use crate::macros::*; pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; pub use crate::type_id::LogicalTypeId; -pub use crate::types::Primitive; +pub use crate::types::{LogicalPrimitiveType, WrapperType}; pub use crate::value::{Value, ValueRef}; -pub use crate::vectors::{ - Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef, -}; +pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; diff --git a/src/datatypes/src/scalars.rs b/src/datatypes/src/scalars.rs index ddb8eff007..327ebaa629 100644 --- a/src/datatypes/src/scalars.rs +++ b/src/datatypes/src/scalars.rs @@ -14,11 +14,17 @@ use std::any::Any; -use common_time::{Date, DateTime, Timestamp}; +use common_time::{Date, DateTime}; -use crate::prelude::*; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::*; +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, +}; +use crate::value::{ListValue, ListValueRef, Value}; +use crate::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, + PrimitiveVector, StringVector, Vector, +}; fn get_iter_capacity>(iter: &I) -> usize { match iter.size_hint() { @@ -35,7 +41,7 @@ where for<'a> Self::VectorType: ScalarVector = Self::RefType<'a>>, { type VectorType: ScalarVector; - type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType> + type RefType<'a>: ScalarRef<'a, ScalarType = Self> where Self: 'a; /// Get a reference of the current value. @@ -46,7 +52,6 @@ where } pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a { - type VectorType: ScalarVector = Self>; /// The corresponding [`Scalar`] type. type ScalarType: Scalar = Self>; @@ -63,7 +68,7 @@ where { type OwnedItem: Scalar; /// The reference item of this vector. - type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self> + type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem> where Self: 'a; @@ -137,47 +142,46 @@ pub trait ScalarVectorBuilder: MutableVector { fn finish(&mut self) -> Self::VectorType; } -macro_rules! impl_primitive_scalar_type { - ($native:ident) => { - impl Scalar for $native { - type VectorType = PrimitiveVector<$native>; - type RefType<'a> = $native; +macro_rules! impl_scalar_for_native { + ($Native: ident, $DataType: ident) => { + impl Scalar for $Native { + type VectorType = PrimitiveVector<$DataType>; + type RefType<'a> = $Native; #[inline] - fn as_scalar_ref(&self) -> $native { + fn as_scalar_ref(&self) -> $Native { *self } #[allow(clippy::needless_lifetimes)] #[inline] - fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native { + fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { long } } /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. 
-        impl<'a> ScalarRef<'a> for $native {
-            type VectorType = PrimitiveVector<$native>;
-            type ScalarType = $native;
+        impl<'a> ScalarRef<'a> for $Native {
+            type ScalarType = $Native;
 
             #[inline]
-            fn to_owned_scalar(&self) -> $native {
+            fn to_owned_scalar(&self) -> $Native {
                 *self
             }
         }
     };
 }
 
-impl_primitive_scalar_type!(u8);
-impl_primitive_scalar_type!(u16);
-impl_primitive_scalar_type!(u32);
-impl_primitive_scalar_type!(u64);
-impl_primitive_scalar_type!(i8);
-impl_primitive_scalar_type!(i16);
-impl_primitive_scalar_type!(i32);
-impl_primitive_scalar_type!(i64);
-impl_primitive_scalar_type!(f32);
-impl_primitive_scalar_type!(f64);
+impl_scalar_for_native!(u8, UInt8Type);
+impl_scalar_for_native!(u16, UInt16Type);
+impl_scalar_for_native!(u32, UInt32Type);
+impl_scalar_for_native!(u64, UInt64Type);
+impl_scalar_for_native!(i8, Int8Type);
+impl_scalar_for_native!(i16, Int16Type);
+impl_scalar_for_native!(i32, Int32Type);
+impl_scalar_for_native!(i64, Int64Type);
+impl_scalar_for_native!(f32, Float32Type);
+impl_scalar_for_native!(f64, Float64Type);
 
 impl Scalar for bool {
     type VectorType = BooleanVector;
@@ -196,7 +200,6 @@ impl Scalar for bool {
 }
 
 impl<'a> ScalarRef<'a> for bool {
-    type VectorType = BooleanVector;
     type ScalarType = bool;
 
     #[inline]
@@ -221,7 +224,6 @@ impl Scalar for String {
 }
 
 impl<'a> ScalarRef<'a> for &'a str {
-    type VectorType = StringVector;
     type ScalarType = String;
 
     #[inline]
@@ -246,7 +248,6 @@ impl Scalar for Vec<u8> {
 }
 
 impl<'a> ScalarRef<'a> for &'a [u8] {
-    type VectorType = BinaryVector;
    type ScalarType = Vec<u8>;
 
     #[inline]
@@ -269,7 +270,6 @@ impl Scalar for Date {
 }
 
 impl<'a> ScalarRef<'a> for Date {
-    type VectorType = DateVector;
     type ScalarType = Date;
 
     fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -291,7 +291,6 @@ impl Scalar for DateTime {
 }
 
 impl<'a> ScalarRef<'a> for DateTime {
-    type VectorType = DateTimeVector;
     type ScalarType = DateTime;
 
     fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -299,27 +298,7 @@ impl<'a> ScalarRef<'a> for DateTime {
     }
 }
 
-impl Scalar for Timestamp {
-    type VectorType = TimestampVector;
-    type RefType<'a> = Timestamp;
-
-    fn as_scalar_ref(&self) -> Self::RefType<'_> {
-        *self
-    }
-
-    fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> {
-        long
-    }
-}
-
-impl<'a> ScalarRef<'a> for Timestamp {
-    type VectorType = TimestampVector;
-    type ScalarType = Timestamp;
-
-    fn to_owned_scalar(&self) -> Self::ScalarType {
-        *self
-    }
-}
+// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. 
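
A minimal sketch of the round-trip contract the Scalar/ScalarRef impls above satisfy, not part of the patch itself: an owned scalar borrows as its RefType, and that reference converts back into the owned form. The helper name assert_round_trip is illustrative only.

    use datatypes::scalars::{Scalar, ScalarRef};

    /// Checks the owned -> reference -> owned round trip for any scalar type.
    fn assert_round_trip<T: Scalar + PartialEq>(owned: T) {
        // Borrow the owned scalar as its reference form, e.g. String -> &str.
        let r = owned.as_scalar_ref();
        // Convert the reference back into an owned scalar, e.g. &str -> String.
        assert!(r.to_owned_scalar() == owned);
    }

    // Works uniformly for natives, strings, and binary payloads:
    // assert_round_trip(42i64);
    // assert_round_trip(String::from("greptime"));
    // assert_round_trip(vec![0u8, 1, 2]);
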
impl Scalar for ListValue { type VectorType = ListVector; @@ -335,7 +314,6 @@ impl Scalar for ListValue { } impl<'a> ScalarRef<'a> for ListValueRef<'a> { - type VectorType = ListVector; type ScalarType = ListValue; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -357,8 +335,9 @@ impl<'a> ScalarRef<'a> for ListValueRef<'a> { #[cfg(test)] mod tests { use super::*; - use crate::vectors::binary::BinaryVector; - use crate::vectors::primitive::Int32Vector; + use crate::data_type::ConcreteDataType; + use crate::timestamp::TimestampSecond; + use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; fn build_vector_from_slice(items: &[Option>]) -> T { let mut builder = T::Builder::with_capacity(items.len()); @@ -454,11 +433,11 @@ mod tests { #[test] fn test_build_timestamp_vector() { - let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; - let vector: TimestampVector = build_vector_from_slice(&expect); + let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; + let vector: TimestampSecondVector = build_vector_from_slice(&expect); assert_vector_eq(&expect, &vector); let val = vector.get_data(0).unwrap(); assert_eq!(val, val.as_scalar_ref()); - assert_eq!(10, val.to_owned_scalar().value()); + assert_eq!(TimestampSecond::from(10), val.to_owned_scalar()); } } diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index e3a5661dfd..4952e36cc0 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -12,129 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod column_schema; mod constraint; mod raw; use std::collections::HashMap; use std::sync::Arc; -pub use arrow::datatypes::Metadata; use arrow::datatypes::{Field, Schema as ArrowSchema}; use datafusion_common::DFSchemaRef; -use serde::{Deserialize, Serialize}; use snafu::{ensure, ResultExt}; -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu}; +use crate::data_type::DataType; +use crate::error::{self, Error, Result}; +pub use crate::schema::column_schema::{ColumnSchema, Metadata}; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; -use crate::vectors::VectorRef; -/// Key used to store whether the column is time index in arrow field's metadata. -const TIME_INDEX_KEY: &str = "greptime:time_index"; /// Key used to store version number of the schema in metadata. const VERSION_KEY: &str = "greptime:version"; -/// Key used to store default constraint in arrow field's metadata. -const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint"; - -/// Schema of a column, used as an immutable struct. 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ColumnSchema { - pub name: String, - pub data_type: ConcreteDataType, - is_nullable: bool, - is_time_index: bool, - default_constraint: Option, - metadata: Metadata, -} - -impl ColumnSchema { - pub fn new>( - name: T, - data_type: ConcreteDataType, - is_nullable: bool, - ) -> ColumnSchema { - ColumnSchema { - name: name.into(), - data_type, - is_nullable, - is_time_index: false, - default_constraint: None, - metadata: Metadata::new(), - } - } - - #[inline] - pub fn is_time_index(&self) -> bool { - self.is_time_index - } - - #[inline] - pub fn is_nullable(&self) -> bool { - self.is_nullable - } - - #[inline] - pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> { - self.default_constraint.as_ref() - } - - #[inline] - pub fn metadata(&self) -> &Metadata { - &self.metadata - } - - pub fn with_time_index(mut self, is_time_index: bool) -> Self { - self.is_time_index = is_time_index; - if is_time_index { - self.metadata - .insert(TIME_INDEX_KEY.to_string(), "true".to_string()); - } else { - self.metadata.remove(TIME_INDEX_KEY); - } - self - } - - pub fn with_default_constraint( - mut self, - default_constraint: Option, - ) -> Result { - if let Some(constraint) = &default_constraint { - constraint.validate(&self.data_type, self.is_nullable)?; - } - - self.default_constraint = default_constraint; - Ok(self) - } - - /// Creates a new [`ColumnSchema`] with given metadata. - pub fn with_metadata(mut self, metadata: Metadata) -> Self { - self.metadata = metadata; - self - } - - pub fn create_default_vector(&self, num_rows: usize) -> Result> { - match &self.default_constraint { - Some(c) => c - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some), - None => { - if self.is_nullable { - // No default constraint, use null as default value. - // TODO(yingwen): Use NullVector once it supports setting logical type. - ColumnDefaultConstraint::null_value() - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some) - } else { - Ok(None) - } - } - } - } -} /// A common schema, should be immutable. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct Schema { column_schemas: Vec, name_to_index: HashMap, @@ -232,7 +131,7 @@ impl Schema { } #[inline] - pub fn metadata(&self) -> &Metadata { + pub fn metadata(&self) -> &HashMap { &self.arrow_schema.metadata } } @@ -244,7 +143,7 @@ pub struct SchemaBuilder { fields: Vec, timestamp_index: Option, version: u32, - metadata: Metadata, + metadata: HashMap, } impl TryFrom> for SchemaBuilder { @@ -293,7 +192,7 @@ impl SchemaBuilder { self.metadata .insert(VERSION_KEY.to_string(), self.version.to_string()); - let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata); + let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); Ok(Schema { column_schemas: self.column_schemas, @@ -348,7 +247,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us let column_schema = &column_schemas[timestamp_index]; ensure!( - column_schema.data_type.is_timestamp(), + column_schema.data_type.is_timestamp_compatible(), error::InvalidTimestampIndexSnafu { index: timestamp_index, } @@ -365,58 +264,6 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us pub type SchemaRef = Arc; -impl TryFrom<&Field> for ColumnSchema { - type Error = Error; - - fn try_from(field: &Field) -> Result { - let data_type = ConcreteDataType::try_from(&field.data_type)?; - let mut metadata = field.metadata.clone(); - let default_constraint = match metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) { - Some(json) => Some(serde_json::from_str(&json).context(DeserializeSnafu { json })?), - None => None, - }; - let is_time_index = metadata.contains_key(TIME_INDEX_KEY); - - Ok(ColumnSchema { - name: field.name.clone(), - data_type, - is_nullable: field.is_nullable, - is_time_index, - default_constraint, - metadata, - }) - } -} - -impl TryFrom<&ColumnSchema> for Field { - type Error = Error; - - fn try_from(column_schema: &ColumnSchema) -> Result { - let mut metadata = column_schema.metadata.clone(); - if let Some(value) = &column_schema.default_constraint { - // Adds an additional metadata to store the default constraint. 
- let old = metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - serde_json::to_string(&value).context(SerializeSnafu)?, - ); - - ensure!( - old.is_none(), - error::DuplicateMetaSnafu { - key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY, - } - ); - } - - Ok(Field::new( - column_schema.name.clone(), - column_schema.data_type.as_arrow_type(), - column_schema.is_nullable(), - ) - .with_metadata(metadata)) - } -} - impl TryFrom> for Schema { type Error = Error; @@ -425,7 +272,7 @@ impl TryFrom> for Schema { let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); for field in &arrow_schema.fields { let column_schema = ColumnSchema::try_from(field)?; - name_to_index.insert(field.name.clone(), column_schemas.len()); + name_to_index.insert(field.name().to_string(), column_schemas.len()); column_schemas.push(column_schema); } @@ -475,7 +322,7 @@ impl TryFrom for Schema { } } -fn try_parse_version(metadata: &Metadata, key: &str) -> Result { +fn try_parse_version(metadata: &HashMap, key: &str) -> Result { if let Some(value) = metadata.get(key) { let version = value .parse() @@ -489,127 +336,8 @@ fn try_parse_version(metadata: &Metadata, key: &str) -> Result { #[cfg(test)] mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use super::*; - use crate::value::Value; - - #[test] - fn test_column_schema() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - assert!(field.is_nullable); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_default_constraint() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::from(99)))) - .unwrap(); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - assert!(field.is_nullable); - assert_eq!( - "{\"Value\":{\"Int32\":99}}", - field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .unwrap() - ); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_metadata() { - let mut metadata = Metadata::new(); - metadata.insert("k1".to_string(), "v1".to_string()); - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - assert_eq!("v1", column_schema.metadata().get("k1").unwrap()); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("v1", field.metadata.get("k1").unwrap()); - assert!(field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_some()); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_duplicate_metadata() { - let mut metadata = Metadata::new(); - metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - "v1".to_string(), - ); - let column_schema = 
ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - Field::try_from(&column_schema).unwrap_err(); - } - - #[test] - fn test_column_schema_invalid_default_constraint() { - ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap_err(); - } - - #[test] - fn test_column_default_constraint_try_into_from() { - let default_constraint = ColumnDefaultConstraint::Value(Value::from(42i64)); - - let bytes: Vec = default_constraint.clone().try_into().unwrap(); - let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap(); - - assert_eq!(default_constraint, from_value); - } - - #[test] - fn test_column_schema_create_default_null() { - // Implicit default null. - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - - // Explicit default null. - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - } - - #[test] - fn test_column_schema_no_default() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false); - assert!(column_schema.create_default_vector(5).unwrap().is_none()); - } + use crate::data_type::ConcreteDataType; #[test] fn test_build_empty_schema() { @@ -664,8 +392,12 @@ mod tests { fn test_schema_with_timestamp() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas.clone()) .unwrap() diff --git a/src/datatypes2/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs similarity index 100% rename from src/datatypes2/src/schema/column_schema.rs rename to src/datatypes/src/schema/column_schema.rs diff --git a/src/datatypes/src/schema/constraint.rs b/src/datatypes/src/schema/constraint.rs index 3750fcebcf..4dd3ecc14b 100644 --- a/src/datatypes/src/schema/constraint.rs +++ b/src/datatypes/src/schema/constraint.rs @@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampVector, VectorRef}; +use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; const CURRENT_TIMESTAMP: &str = "current_timestamp()"; @@ -81,7 +81,7 @@ impl ColumnDefaultConstraint { error::UnsupportedDefaultExprSnafu { expr } ); ensure!( - data_type.is_timestamp(), + data_type.is_timestamp_compatible(), error::DefaultValueTypeSnafu { reason: "return value of the function must has timestamp type", } @@ -162,8 +162,10 @@ fn create_current_timestamp_vector( data_type: &ConcreteDataType, num_rows: usize, ) -> Result { + // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector + // to other data type and avoid this match. 
match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values( + ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( std::iter::repeat(util::current_time_millis()).take(num_rows), ))), ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( @@ -217,7 +219,7 @@ mod tests { fn test_validate_function_constraint() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap(); constraint .validate(&ConcreteDataType::boolean_datatype(), false) @@ -225,7 +227,7 @@ mod tests { let constraint = ColumnDefaultConstraint::Function("hello()".to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap_err(); } @@ -262,7 +264,7 @@ mod tests { fn test_create_default_vector_by_func() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); // Timestamp type. - let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); let v = constraint .create_default_vector(&data_type, false, 4) .unwrap(); @@ -286,7 +288,7 @@ mod tests { ); let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); constraint .create_default_vector(&data_type, false, 4) .unwrap_err(); diff --git a/src/datatypes/src/schema/raw.rs b/src/datatypes/src/schema/raw.rs index f415a1ab85..75f0853b4b 100644 --- a/src/datatypes/src/schema/raw.rs +++ b/src/datatypes/src/schema/raw.rs @@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; /// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). /// /// This struct only contains necessary data to recover the Schema. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct RawSchema { pub column_schemas: Vec, pub timestamp_index: Option, @@ -56,8 +56,12 @@ mod tests { fn test_raw_convert() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas) .unwrap() diff --git a/src/datatypes2/src/timestamp.rs b/src/datatypes/src/timestamp.rs similarity index 89% rename from src/datatypes2/src/timestamp.rs rename to src/datatypes/src/timestamp.rs index f14e91a6c6..82d68ae662 100644 --- a/src/datatypes2/src/timestamp.rs +++ b/src/datatypes/src/timestamp.rs @@ -104,6 +104,12 @@ macro_rules! 
define_timestamp_with_unit {
                     [<Timestamp $unit>]::from_native(val)
                 }
             }
+
+            impl From<[<Timestamp $unit>]> for i64 {
+                fn from(val: [<Timestamp $unit>]) -> Self {
+                    val.0.value()
+                }
+            }
         }
     };
 }
@@ -117,6 +123,18 @@ define_timestamp_with_unit!(Nanosecond);
 mod tests {
     use super::*;
 
+    #[test]
+    fn test_to_serde_json_value() {
+        let ts = TimestampSecond::new(123);
+        let val = serde_json::Value::from(ts);
+        match val {
+            serde_json::Value::String(s) => {
+                assert_eq!("1970-01-01 00:02:03+0000", s);
+            }
+            _ => unreachable!(),
+        }
+    }
+
     #[test]
     fn test_timestamp_scalar() {
         let ts = TimestampSecond::new(123);
diff --git a/src/datatypes/src/type_id.rs b/src/datatypes/src/type_id.rs
index fa11430dec..bcb7ea52b1 100644
--- a/src/datatypes/src/type_id.rs
+++ b/src/datatypes/src/type_id.rs
@@ -42,7 +42,10 @@ pub enum LogicalTypeId {
     /// seconds/milliseconds/microseconds/nanoseconds, determined by precision.
     DateTime,
 
-    Timestamp,
+    TimestampSecond,
+    TimestampMillisecond,
+    TimestampMicrosecond,
+    TimestampNanosecond,
 
     List,
 }
@@ -74,7 +77,14 @@ impl LogicalTypeId {
             LogicalTypeId::Binary => ConcreteDataType::binary_datatype(),
             LogicalTypeId::Date => ConcreteDataType::date_datatype(),
             LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(),
-            LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit
+            LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(),
+            LogicalTypeId::TimestampMillisecond => {
+                ConcreteDataType::timestamp_millisecond_datatype()
+            }
+            LogicalTypeId::TimestampMicrosecond => {
+                ConcreteDataType::timestamp_microsecond_datatype()
+            }
+            LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(),
             LogicalTypeId::List => {
                 ConcreteDataType::list_datatype(ConcreteDataType::null_datatype())
             }
diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs
index aabeb59db3..8f40c563de 100644
--- a/src/datatypes/src/types.rs
+++ b/src/datatypes/src/types.rs
@@ -14,25 +14,27 @@
 
 mod binary_type;
 mod boolean_type;
-mod date;
-mod datetime;
+mod date_type;
+mod datetime_type;
 mod list_type;
 mod null_type;
-mod primitive_traits;
 mod primitive_type;
 mod string_type;
-mod timestamp;
+
+mod timestamp_type;
 
 pub use binary_type::BinaryType;
 pub use boolean_type::BooleanType;
-pub use date::DateType;
-pub use datetime::DateTimeType;
+pub use date_type::DateType;
+pub use datetime_type::DateTimeType;
 pub use list_type::ListType;
 pub use null_type::NullType;
-pub use primitive_traits::{OrdPrimitive, Primitive};
 pub use primitive_type::{
-    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement,
-    PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
+    NativeType, OrdPrimitive, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
 };
 pub use string_type::StringType;
-pub use timestamp::TimestampType;
+pub use timestamp_type::{
+    TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
+    TimestampSecondType, TimestampType,
+};
diff --git a/src/datatypes/src/types/binary_type.rs b/src/datatypes/src/types/binary_type.rs
index 13922ff063..0d06724fff 100644
--- a/src/datatypes/src/types/binary_type.rs
+++ b/src/datatypes/src/types/binary_type.rs
@@ -53,4 +53,8 @@ impl DataType for BinaryType {
     fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
         Box::new(BinaryVectorBuilder::with_capacity(capacity))
     }
+
+    fn is_timestamp_compatible(&self) -> bool {
+        false
+    }
 }
diff --git 
a/src/datatypes/src/types/boolean_type.rs b/src/datatypes/src/types/boolean_type.rs index 4566f1d826..36d92169eb 100644 --- a/src/datatypes/src/types/boolean_type.rs +++ b/src/datatypes/src/types/boolean_type.rs @@ -52,4 +52,8 @@ impl DataType for BooleanType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BooleanVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/date.rs b/src/datatypes/src/types/date.rs deleted file mode 100644 index 8d2cca12fa..0000000000 --- a/src/datatypes/src/types/date.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::DataType; -use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; -use crate::vectors::{DateVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DateType; - -impl DataType for DateType { - fn name(&self) -> &str { - "Date" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Date - } - - fn default_value(&self) -> Value { - Value::Date(Default::default()) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Date32 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(DateVectorBuilder::with_capacity(capacity)) - } -} - -impl DateType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} diff --git a/src/datatypes2/src/types/date_type.rs b/src/datatypes/src/types/date_type.rs similarity index 98% rename from src/datatypes2/src/types/date_type.rs rename to src/datatypes/src/types/date_type.rs index 052b837a3d..afd482359d 100644 --- a/src/datatypes2/src/types/date_type.rs +++ b/src/datatypes/src/types/date_type.rs @@ -59,6 +59,7 @@ impl LogicalPrimitiveType for DateType { type ArrowPrimitive = Date32Type; type Native = i32; type Wrapper = Date; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::date_datatype() diff --git a/src/datatypes/src/types/datetime.rs b/src/datatypes/src/types/datetime.rs deleted file mode 100644 index 6166c73f37..0000000000 --- a/src/datatypes/src/types/datetime.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
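
Before the old DateTime implementation is removed below, a small usage sketch of the Date wrapper over its native i64-free i32 day count. This assumes the WrapperType impl for Date introduced later in this patch and the datatypes crate paths used elsewhere in this repository.

    use common_time::Date;
    use datatypes::types::WrapperType; // assumed re-export path

    #[test]
    fn date_wrapper_round_trip() {
        // `Date` wraps the number of days since the UNIX epoch as an i32.
        let d = Date::new(7);
        assert_eq!(7, d.into_native()); // WrapperType::into_native -> i32
        assert_eq!(d, Date::from_native(7)); // and back again
    }
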
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::prelude::{LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; -use crate::vectors::{DateTimeVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DateTimeType; - -const DATE_TIME_TYPE_NAME: &str = "DateTime"; - -/// [DateTimeType] represents the seconds elapsed since UNIX EPOCH. -impl DataType for DateTimeType { - fn name(&self) -> &str { - DATE_TIME_TYPE_NAME - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::DateTime - } - - fn default_value(&self) -> Value { - Value::DateTime(Default::default()) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Date64 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(DateTimeVectorBuilder::with_capacity(capacity)) - } -} - -impl DateTimeType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } - - pub fn name() -> &'static str { - DATE_TIME_TYPE_NAME - } -} diff --git a/src/datatypes2/src/types/datetime_type.rs b/src/datatypes/src/types/datetime_type.rs similarity index 98% rename from src/datatypes2/src/types/datetime_type.rs rename to src/datatypes/src/types/datetime_type.rs index d74a02effe..ccd810eee7 100644 --- a/src/datatypes2/src/types/datetime_type.rs +++ b/src/datatypes/src/types/datetime_type.rs @@ -57,6 +57,7 @@ impl LogicalPrimitiveType for DateTimeType { type ArrowPrimitive = Date64Type; type Native = i64; type Wrapper = DateTime; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::datetime_datatype() diff --git a/src/datatypes/src/types/list_type.rs b/src/datatypes/src/types/list_type.rs index 1ada109011..3c8535810d 100644 --- a/src/datatypes/src/types/list_type.rs +++ b/src/datatypes/src/types/list_type.rs @@ -15,15 +15,17 @@ use arrow::datatypes::{DataType as ArrowDataType, Field}; use serde::{Deserialize, Serialize}; -use crate::prelude::*; -use crate::value::ListValue; +use crate::data_type::{ConcreteDataType, DataType}; +use crate::type_id::LogicalTypeId; +use crate::value::{ListValue, Value}; use crate::vectors::{ListVectorBuilder, MutableVector}; /// Used to represent the List datatype. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ListType { - /// The type of List's inner data. - inner: Box, + /// The type of List's item. + // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. + item_type: Box, } impl Default for ListType { @@ -33,11 +35,18 @@ impl Default for ListType { } impl ListType { - pub fn new(datatype: ConcreteDataType) -> Self { + /// Create a new `ListType` whose item's data type is `item_type`. + pub fn new(item_type: ConcreteDataType) -> Self { ListType { - inner: Box::new(datatype), + item_type: Box::new(item_type), } } + + /// Returns the item data type. 
+ #[inline] + pub fn item_type(&self) -> &ConcreteDataType { + &self.item_type + } } impl DataType for ListType { @@ -50,20 +59,24 @@ impl DataType for ListType { } fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.inner.clone())) + Value::List(ListValue::new(None, *self.item_type.clone())) } fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true)); + let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); ArrowDataType::List(field) } fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(ListVectorBuilder::with_type_capacity( - *self.inner.clone(), + *self.item_type.clone(), capacity, )) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } #[cfg(test)] @@ -84,5 +97,6 @@ mod tests { ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Boolean, true))), t.as_arrow_type() ); + assert_eq!(ConcreteDataType::boolean_datatype(), *t.item_type()); } } diff --git a/src/datatypes/src/types/null_type.rs b/src/datatypes/src/types/null_type.rs index a0b027dd14..b9bb2dc752 100644 --- a/src/datatypes/src/types/null_type.rs +++ b/src/datatypes/src/types/null_type.rs @@ -27,7 +27,7 @@ pub struct NullType; impl NullType { pub fn arc() -> DataTypeRef { - Arc::new(Self) + Arc::new(NullType) } } @@ -51,4 +51,8 @@ impl DataType for NullType { fn create_mutable_vector(&self, _capacity: usize) -> Box { Box::new(NullVectorBuilder::default()) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/primitive_traits.rs b/src/datatypes/src/types/primitive_traits.rs deleted file mode 100644 index e900ba217e..0000000000 --- a/src/datatypes/src/types/primitive_traits.rs +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; - -use arrow::compute::arithmetics::basic::NativeArithmetics; -use arrow::types::NativeType; -use num::NumCast; - -use crate::prelude::Scalar; -use crate::value::{IntoValueRef, Value}; - -/// Primitive type. -pub trait Primitive: - PartialOrd - + Default - + Clone - + Copy - + Into - + IntoValueRef<'static> - + NativeType - + serde::Serialize - + NativeArithmetics - + NumCast - + Scalar -{ - /// Largest numeric type this primitive type can be cast to. - type LargestType: Primitive; -} - -macro_rules! impl_primitive { - ($Type:ident, $LargestType: ident) => { - impl Primitive for $Type { - type LargestType = $LargestType; - } - }; -} - -impl_primitive!(u8, u64); -impl_primitive!(u16, u64); -impl_primitive!(u32, u64); -impl_primitive!(u64, u64); -impl_primitive!(i8, i64); -impl_primitive!(i16, i64); -impl_primitive!(i32, i64); -impl_primitive!(i64, i64); -impl_primitive!(f32, f64); -impl_primitive!(f64, f64); - -/// A new type for [Primitive], complement the `Ord` feature for it. Wrapping not ordered -/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that -/// require `Ord`. 
For example, in `Median` or `Percentile` UDAFs. -#[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrdPrimitive(pub T); - -impl OrdPrimitive { - pub fn as_primitive(&self) -> T { - self.0 - } -} - -impl Eq for OrdPrimitive {} - -impl PartialOrd for OrdPrimitive { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for OrdPrimitive { - fn cmp(&self, other: &Self) -> Ordering { - self.0.into().cmp(&other.0.into()) - } -} - -impl From> for Value { - fn from(p: OrdPrimitive) -> Self { - p.0.into() - } -} - -#[cfg(test)] -mod tests { - use std::collections::BinaryHeap; - - use super::*; - - #[test] - fn test_ord_primitive() { - struct Foo - where - T: Primitive, - { - heap: BinaryHeap>, - } - - impl Foo - where - T: Primitive, - { - fn push(&mut self, value: T) { - let value = OrdPrimitive::(value); - self.heap.push(value); - } - } - - macro_rules! test { - ($Type:ident) => { - let mut foo = Foo::<$Type> { - heap: BinaryHeap::new(), - }; - foo.push($Type::default()); - }; - } - - test!(u8); - test!(u16); - test!(u32); - test!(u64); - test!(i8); - test!(i16); - test!(i32); - test!(i64); - test!(f32); - test!(f64); - } -} diff --git a/src/datatypes/src/types/primitive_type.rs b/src/datatypes/src/types/primitive_type.rs index b9f07ce82c..ea752cf8de 100644 --- a/src/datatypes/src/types/primitive_type.rs +++ b/src/datatypes/src/types/primitive_type.rs @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::TypeId; -use std::marker::PhantomData; +use std::cmp::Ordering; +use std::fmt; -use arrow::array::PrimitiveArray; -use arrow::datatypes::DataType as ArrowDataType; -use paste::paste; +use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; +use common_time::{Date, DateTime}; +use num::NumCast; use serde::{Deserialize, Serialize}; use snafu::OptionExt; @@ -25,92 +25,227 @@ use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; use crate::type_id::LogicalTypeId; -use crate::types::primitive_traits::Primitive; +use crate::types::{DateTimeType, DateType}; use crate::value::{Value, ValueRef}; use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; -#[derive(Clone, Serialize, Deserialize)] -pub struct PrimitiveType { - #[serde(skip)] - _phantom: PhantomData, +/// Data types that can be used as arrow's native type. +pub trait NativeType: ArrowNativeType + NumCast {} + +macro_rules! impl_native_type { + ($Type: ident) => { + impl NativeType for $Type {} + }; } -impl PartialEq> for PrimitiveType { - fn eq(&self, _other: &PrimitiveType) -> bool { - TypeId::of::() == TypeId::of::() - } -} +impl_native_type!(u8); +impl_native_type!(u16); +impl_native_type!(u32); +impl_native_type!(u64); +impl_native_type!(i8); +impl_native_type!(i16); +impl_native_type!(i32); +impl_native_type!(i64); +impl_native_type!(f32); +impl_native_type!(f64); -impl Eq for PrimitiveType {} - -/// A trait that provide helper methods for a primitive type to implementing the [PrimitiveVector]. -pub trait PrimitiveElement -where - for<'a> Self: Primitive - + Scalar> - + ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector> - + Scalar = Self>, +/// Represents the wrapper type that wraps a native type using the `newtype pattern`, +/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native +/// type `i32`. 
+pub trait WrapperType: + Copy + + Send + + Sync + + fmt::Debug + + for<'a> Scalar = Self> + + PartialEq + + Into + + Into> + + Serialize + + Into { + /// Logical primitive type that this wrapper type belongs to. + type LogicalType: LogicalPrimitiveType; + /// The underlying native type. + type Native: NativeType; + + /// Convert native type into this wrapper type. + fn from_native(value: Self::Native) -> Self; + + /// Convert this wrapper type into native type. + fn into_native(self) -> Self::Native; +} + +/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. +pub trait LogicalPrimitiveType: 'static + Sized { + /// Arrow primitive type of this logical type. + type ArrowPrimitive: ArrowPrimitiveType; + /// Native (physical) type of this logical type. + type Native: NativeType; + /// Wrapper type that the vector returns. + type Wrapper: WrapperType + + for<'a> Scalar, RefType<'a> = Self::Wrapper> + + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; + /// Largest type this primitive type can cast to. + type LargestType: LogicalPrimitiveType; + /// Construct the data type struct. fn build_data_type() -> ConcreteDataType; - /// Returns the name of the type id. - fn type_name() -> String; + /// Return the name of the type. + fn type_name() -> &'static str; /// Dynamic cast the vector to the concrete vector type. - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray>; + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector>; /// Cast value ref to the primitive type. - fn cast_value_ref(value: ValueRef) -> Result>; + fn cast_value_ref(value: ValueRef) -> Result>; } -macro_rules! impl_primitive_element { - ($Type:ident, $TypeId:ident) => { - paste::paste! { - impl PrimitiveElement for $Type { - fn build_data_type() -> ConcreteDataType { - ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default()) - } +/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered +/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that +/// require `Ord`. For example, in `Median` or `Percentile` UDAFs. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct OrdPrimitive(pub T); - fn type_name() -> String { - stringify!($TypeId).to_string() - } +impl OrdPrimitive { + pub fn as_primitive(&self) -> T::Native { + self.0.into_native() + } +} - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> { - let primitive_vector = vector - .as_any() - .downcast_ref::>() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to cast {} to vector of primitive type {}", - vector.vector_type_name(), - stringify!($TypeId) - ), - })?; - Ok(&primitive_vector.array) - } +impl Eq for OrdPrimitive {} - fn cast_value_ref(value: ValueRef) -> Result> { - match value { - ValueRef::Null => Ok(None), - ValueRef::$TypeId(v) => Ok(Some(v.into())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value {:?} to primitive type {}", - other, - stringify!($TypeId), - ), - }.fail(), +impl PartialOrd for OrdPrimitive { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for OrdPrimitive { + fn cmp(&self, other: &Self) -> Ordering { + Into::::into(self.0).cmp(&Into::::into(other.0)) + } +} + +impl From> for Value { + fn from(p: OrdPrimitive) -> Self { + p.0.into() + } +} + +macro_rules! 
impl_wrapper { + ($Type: ident, $LogicalType: ident) => { + impl WrapperType for $Type { + type LogicalType = $LogicalType; + type Native = $Type; + + fn from_native(value: Self::Native) -> Self { + value + } + + fn into_native(self) -> Self::Native { + self + } + } + }; +} + +impl_wrapper!(u8, UInt8Type); +impl_wrapper!(u16, UInt16Type); +impl_wrapper!(u32, UInt32Type); +impl_wrapper!(u64, UInt64Type); +impl_wrapper!(i8, Int8Type); +impl_wrapper!(i16, Int16Type); +impl_wrapper!(i32, Int32Type); +impl_wrapper!(i64, Int64Type); +impl_wrapper!(f32, Float32Type); +impl_wrapper!(f64, Float64Type); + +impl WrapperType for Date { + type LogicalType = DateType; + type Native = i32; + + fn from_native(value: i32) -> Self { + Date::new(value) + } + + fn into_native(self) -> i32 { + self.val() + } +} + +impl WrapperType for DateTime { + type LogicalType = DateTimeType; + type Native = i64; + + fn from_native(value: Self::Native) -> Self { + DateTime::new(value) + } + + fn into_native(self) -> Self::Native { + self.val() + } +} + +macro_rules! define_logical_primitive_type { + ($Native: ident, $TypeId: ident, $DataType: ident, $Largest: ident) => { + // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit + // `struct DataType;` to ensure the serialized JSON string is compatible with previous + // implementation. + #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] + pub struct $DataType {} + + impl LogicalPrimitiveType for $DataType { + type ArrowPrimitive = arrow::datatypes::$DataType; + type Native = $Native; + type Wrapper = $Native; + type LargestType = $Largest; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::$TypeId($DataType::default()) + } + + fn type_name() -> &'static str { + stringify!($TypeId) + } + + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { + vector + .as_any() + .downcast_ref::>() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to vector of primitive type {}", + vector.vector_type_name(), + stringify!($TypeId) + ), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::$TypeId(v) => Ok(Some(v.into())), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value {:?} to primitive type {}", + other, + stringify!($TypeId), + ), } + .fail(), } } } }; } -macro_rules! impl_numeric { - ($Type:ident, $TypeId:ident) => { - impl DataType for PrimitiveType<$Type> { +macro_rules! define_non_timestamp_primitive { + ($Native: ident, $TypeId: ident, $DataType: ident, $Largest: ident) => { + define_logical_primitive_type!($Native, $TypeId, $DataType, $Largest); + + impl DataType for $DataType { fn name(&self) -> &str { stringify!($TypeId) } @@ -120,7 +255,7 @@ macro_rules! impl_numeric { } fn default_value(&self) -> Value { - $Type::default().into() + $Native::default().into() } fn as_arrow_type(&self) -> ArrowDataType { @@ -128,61 +263,99 @@ macro_rules! 
impl_numeric { } fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity)) + Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) } - } - impl std::fmt::Debug for PrimitiveType<$Type> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name()) + fn is_timestamp_compatible(&self) -> bool { + false } } - - impl Default for PrimitiveType<$Type> { - fn default() -> Self { - Self { - _phantom: PhantomData, - } - } - } - - impl_primitive_element!($Type, $TypeId); - - paste! { - pub type [<$TypeId Type>]=PrimitiveType<$Type>; - } }; } -impl_numeric!(u8, UInt8); -impl_numeric!(u16, UInt16); -impl_numeric!(u32, UInt32); -impl_numeric!(u64, UInt64); -impl_numeric!(i8, Int8); -impl_numeric!(i16, Int16); -impl_numeric!(i32, Int32); -impl_numeric!(i64, Int64); -impl_numeric!(f32, Float32); -impl_numeric!(f64, Float64); +define_non_timestamp_primitive!(u8, UInt8, UInt8Type, UInt64Type); +define_non_timestamp_primitive!(u16, UInt16, UInt16Type, UInt64Type); +define_non_timestamp_primitive!(u32, UInt32, UInt32Type, UInt64Type); +define_non_timestamp_primitive!(u64, UInt64, UInt64Type, UInt64Type); +define_non_timestamp_primitive!(i8, Int8, Int8Type, Int64Type); +define_non_timestamp_primitive!(i16, Int16, Int16Type, Int64Type); +define_non_timestamp_primitive!(i32, Int32, Int32Type, Int64Type); +define_non_timestamp_primitive!(f32, Float32, Float32Type, Float64Type); +define_non_timestamp_primitive!(f64, Float64, Float64Type, Float64Type); + +// Timestamp primitive: +define_logical_primitive_type!(i64, Int64, Int64Type, Int64Type); + +impl DataType for Int64Type { + fn name(&self) -> &str { + "Int64" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Int64 + } + + fn default_value(&self) -> Value { + Value::Int64(0) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Int64 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + true + } +} #[cfg(test)] mod tests { + use std::collections::BinaryHeap; + use super::*; #[test] - fn test_eq() { - assert_eq!(UInt8Type::default(), UInt8Type::default()); - assert_eq!(UInt16Type::default(), UInt16Type::default()); - assert_eq!(UInt32Type::default(), UInt32Type::default()); - assert_eq!(UInt64Type::default(), UInt64Type::default()); - assert_eq!(Int8Type::default(), Int8Type::default()); - assert_eq!(Int16Type::default(), Int16Type::default()); - assert_eq!(Int32Type::default(), Int32Type::default()); - assert_eq!(Int64Type::default(), Int64Type::default()); - assert_eq!(Float32Type::default(), Float32Type::default()); - assert_eq!(Float64Type::default(), Float64Type::default()); + fn test_ord_primitive() { + struct Foo + where + T: WrapperType, + { + heap: BinaryHeap>, + } - assert_ne!(Float32Type::default(), Float64Type::default()); - assert_ne!(Float32Type::default(), Int32Type::default()); + impl Foo + where + T: WrapperType, + { + fn push(&mut self, value: T) { + let value = OrdPrimitive::(value); + self.heap.push(value); + } + } + + macro_rules! 
test { + ($Type:ident) => { + let mut foo = Foo::<$Type> { + heap: BinaryHeap::new(), + }; + foo.push($Type::default()); + assert_eq!($Type::default(), foo.heap.pop().unwrap().as_primitive()); + }; + } + + test!(u8); + test!(u16); + test!(u32); + test!(u64); + test!(i8); + test!(i16); + test!(i32); + test!(i64); + test!(f32); + test!(f64); } } diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 736a3faac9..799cbbbdd3 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -18,9 +18,10 @@ use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::StringBytes; use serde::{Deserialize, Serialize}; -use crate::data_type::DataType; -use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; +use crate::data_type::{DataType, DataTypeRef}; +use crate::prelude::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::value::Value; use crate::vectors::{MutableVector, StringVectorBuilder}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -52,4 +53,8 @@ impl DataType for StringType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(StringVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/timestamp.rs b/src/datatypes/src/types/timestamp.rs deleted file mode 100644 index b80d16a64f..0000000000 --- a/src/datatypes/src/types/timestamp.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
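
The old timestamp type removed below collapsed every time unit into the single LogicalTypeId::Timestamp; with the per-unit types that replace it, each unit gets its own logical id. A brief sketch, assuming the crate paths used elsewhere in this repository:

    use datatypes::data_type::{ConcreteDataType, DataType};
    use datatypes::type_id::LogicalTypeId;

    #[test]
    fn per_unit_timestamp_ids() {
        // Each time unit now maps to a distinct logical type id.
        assert_eq!(
            LogicalTypeId::TimestampSecond,
            ConcreteDataType::timestamp_second_datatype().logical_type_id()
        );
        assert_eq!(
            LogicalTypeId::TimestampNanosecond,
            ConcreteDataType::timestamp_nanosecond_datatype().logical_type_id()
        );
    }
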
- -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; -use common_time::timestamp::{TimeUnit, Timestamp}; -use serde::{Deserialize, Serialize}; - -use crate::data_type::DataType; -use crate::prelude::{LogicalTypeId, MutableVector, ScalarVectorBuilder, Value}; -use crate::vectors::TimestampVectorBuilder; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TimestampType { - pub unit: TimeUnit, -} - -impl TimestampType { - pub fn new(unit: TimeUnit) -> Self { - Self { unit } - } -} - -impl DataType for TimestampType { - fn name(&self) -> &str { - "Timestamp" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Timestamp - } - - fn default_value(&self) -> Value { - Value::Timestamp(Timestamp::new(0, self.unit)) - } - - fn as_arrow_type(&self) -> ArrowDataType { - match self.unit { - TimeUnit::Second => ArrowDataType::Timestamp(ArrowTimeUnit::Second, None), - TimeUnit::Millisecond => ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None), - TimeUnit::Microsecond => ArrowDataType::Timestamp(ArrowTimeUnit::Microsecond, None), - TimeUnit::Nanosecond => ArrowDataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - } - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(TimestampVectorBuilder::with_capacity(capacity)) - } -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::TimeUnit as ArrowTimeUnit; - use common_time::timestamp::TimeUnit::Microsecond; - - use super::*; - use crate::prelude::{ConcreteDataType, ValueRef}; - - #[test] - pub fn test_timestamp_type() { - assert_eq!( - LogicalTypeId::Timestamp, - TimestampType::new(TimeUnit::Microsecond).logical_type_id() - ); - } - - #[test] - pub fn test_as_arrow_type() { - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - TimestampType::new(TimeUnit::Nanosecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Microsecond, None), - TimestampType::new(TimeUnit::Microsecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None), - TimestampType::new(TimeUnit::Millisecond).as_arrow_type() - ); - assert_eq!( - ArrowDataType::Timestamp(ArrowTimeUnit::Second, None), - TimestampType::new(TimeUnit::Second).as_arrow_type() - ); - } - - #[test] - pub fn test_default_value() { - assert_eq!( - Value::Timestamp(Timestamp::new(0, Microsecond)), - TimestampType::new(TimeUnit::Microsecond).default_value() - ); - } - - #[test] - pub fn test_create_mutable_vector() { - let mut builder = TimestampType::new(TimeUnit::Microsecond).create_mutable_vector(10); - builder - .push_value_ref(ValueRef::Timestamp(Timestamp::new( - 42, - TimeUnit::Millisecond, - ))) - .unwrap(); - builder.push_value_ref(ValueRef::Null).unwrap(); - builder - .push_value_ref(ValueRef::Timestamp(Timestamp::new(96, TimeUnit::Second))) - .unwrap(); - let v = builder.to_vector(); - assert_eq!(ConcreteDataType::timestamp_millis_datatype(), v.data_type()); - assert_eq!(Value::Timestamp(Timestamp::from_millis(42)), v.get(0)); - assert_eq!(Value::Null, v.get(1)); - // Push a timestamp with different unit will convert the value to value with time unit millisecond. 
- assert_eq!(Value::Timestamp(Timestamp::from_millis(96_000)), v.get(2)); - } -} diff --git a/src/datatypes2/src/types/timestamp_type.rs b/src/datatypes/src/types/timestamp_type.rs similarity index 81% rename from src/datatypes2/src/types/timestamp_type.rs rename to src/datatypes/src/types/timestamp_type.rs index fe86eeb8fd..629d901cc8 100644 --- a/src/datatypes2/src/types/timestamp_type.rs +++ b/src/datatypes/src/types/timestamp_type.rs @@ -50,6 +50,18 @@ pub enum TimestampType { Nanosecond(TimestampNanosecondType), } +impl TimestampType { + /// Returns the [`TimeUnit`] of this type. + pub fn unit(&self) -> TimeUnit { + match self { + TimestampType::Second(_) => TimeUnit::Second, + TimestampType::Millisecond(_) => TimeUnit::Millisecond, + TimestampType::Microsecond(_) => TimeUnit::Microsecond, + TimestampType::Nanosecond(_) => TimeUnit::Nanosecond, + } + } +} + macro_rules! impl_data_type_for_timestamp { ($unit: ident) => { paste! { @@ -58,7 +70,7 @@ macro_rules! impl_data_type_for_timestamp { impl DataType for [] { fn name(&self) -> &str { - stringify!([]) + stringify!([]) } fn logical_type_id(&self) -> LogicalTypeId { @@ -82,11 +94,11 @@ macro_rules! impl_data_type_for_timestamp { } } - impl LogicalPrimitiveType for [] { type ArrowPrimitive = []; type Native = i64; type Wrapper = []; + type LargestType = Self; fn build_data_type() -> ConcreteDataType { ConcreteDataType::Timestamp(TimestampType::$unit( @@ -113,6 +125,9 @@ macro_rules! impl_data_type_for_timestamp { fn cast_value_ref(value: ValueRef) -> crate::Result> { match value { ValueRef::Null => Ok(None), + ValueRef::Int64(v) =>{ + Ok(Some([]::from(v))) + } ValueRef::Timestamp(t) => match t.unit() { TimeUnit::$unit => Ok(Some([](t))), other => error::CastTypeSnafu { @@ -138,3 +153,28 @@ impl_data_type_for_timestamp!(Nanosecond); impl_data_type_for_timestamp!(Second); impl_data_type_for_timestamp!(Millisecond); impl_data_type_for_timestamp!(Microsecond); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_timestamp_type_unit() { + assert_eq!( + TimeUnit::Second, + TimestampType::Second(TimestampSecondType).unit() + ); + assert_eq!( + TimeUnit::Millisecond, + TimestampType::Millisecond(TimestampMillisecondType).unit() + ); + assert_eq!( + TimeUnit::Microsecond, + TimestampType::Microsecond(TimestampMicrosecondType).unit() + ); + assert_eq!( + TimeUnit::Nanosecond, + TimestampType::Nanosecond(TimestampNanosecondType).unit() + ); + } +} diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index d5e0ae3e9f..457c774606 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -15,6 +15,7 @@ use std::cmp::Ordering; use std::fmt::{Display, Formatter}; +use arrow::datatypes::{DataType as ArrowDataType, Field}; use common_base::bytes::{Bytes, StringBytes}; use common_time::date::Date; use common_time::datetime::DateTime; @@ -22,10 +23,12 @@ use common_time::timestamp::{TimeUnit, Timestamp}; use datafusion_common::ScalarValue; pub use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; +use snafu::ensure; use crate::error::{self, Result}; use crate::prelude::*; use crate::type_id::LogicalTypeId; +use crate::types::ListType; use crate::vectors::ListVector; pub type OrderedF32 = OrderedFloat; @@ -125,10 +128,10 @@ impl Value { Value::Float64(_) => ConcreteDataType::float64_datatype(), Value::String(_) => ConcreteDataType::string_datatype(), Value::Binary(_) => ConcreteDataType::binary_datatype(), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), 
Value::Date(_) => ConcreteDataType::date_datatype(), Value::DateTime(_) => ConcreteDataType::datetime_datatype(), Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), + Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), } } @@ -193,9 +196,95 @@ impl Value { Value::List(_) => LogicalTypeId::List, Value::Date(_) => LogicalTypeId::Date, Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(_) => LogicalTypeId::Timestamp, + Value::Timestamp(t) => match t.unit() { + TimeUnit::Second => LogicalTypeId::TimestampSecond, + TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, + TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, + TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, + }, } } + + /// Convert the value into [`ScalarValue`] according to the `output_type`. + pub fn try_to_scalar_value(&self, output_type: &ConcreteDataType) -> Result { + // Compare logical type, since value might not contains full type information. + let value_type_id = self.logical_type_id(); + let output_type_id = output_type.logical_type_id(); + ensure!( + output_type_id == value_type_id || self.is_null(), + error::ToScalarValueSnafu { + reason: format!( + "expect value to return output_type {:?}, actual: {:?}", + output_type_id, value_type_id, + ), + } + ); + + let scalar_value = match self { + Value::Boolean(v) => ScalarValue::Boolean(Some(*v)), + Value::UInt8(v) => ScalarValue::UInt8(Some(*v)), + Value::UInt16(v) => ScalarValue::UInt16(Some(*v)), + Value::UInt32(v) => ScalarValue::UInt32(Some(*v)), + Value::UInt64(v) => ScalarValue::UInt64(Some(*v)), + Value::Int8(v) => ScalarValue::Int8(Some(*v)), + Value::Int16(v) => ScalarValue::Int16(Some(*v)), + Value::Int32(v) => ScalarValue::Int32(Some(*v)), + Value::Int64(v) => ScalarValue::Int64(Some(*v)), + Value::Float32(v) => ScalarValue::Float32(Some(v.0)), + Value::Float64(v) => ScalarValue::Float64(Some(v.0)), + Value::String(v) => ScalarValue::Utf8(Some(v.as_utf8().to_string())), + Value::Binary(v) => ScalarValue::LargeBinary(Some(v.to_vec())), + Value::Date(v) => ScalarValue::Date32(Some(v.val())), + Value::DateTime(v) => ScalarValue::Date64(Some(v.val())), + Value::Null => to_null_value(output_type), + Value::List(list) => { + // Safety: The logical type of the value and output_type are the same. + let list_type = output_type.as_list().unwrap(); + list.try_to_scalar_value(list_type)? 
+ } + Value::Timestamp(t) => timestamp_to_scalar_value(t.unit(), Some(t.value())), + }; + + Ok(scalar_value) + } +} + +fn to_null_value(output_type: &ConcreteDataType) -> ScalarValue { + match output_type { + ConcreteDataType::Null(_) => ScalarValue::Null, + ConcreteDataType::Boolean(_) => ScalarValue::Boolean(None), + ConcreteDataType::Int8(_) => ScalarValue::Int8(None), + ConcreteDataType::Int16(_) => ScalarValue::Int16(None), + ConcreteDataType::Int32(_) => ScalarValue::Int32(None), + ConcreteDataType::Int64(_) => ScalarValue::Int64(None), + ConcreteDataType::UInt8(_) => ScalarValue::UInt8(None), + ConcreteDataType::UInt16(_) => ScalarValue::UInt16(None), + ConcreteDataType::UInt32(_) => ScalarValue::UInt32(None), + ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None), + ConcreteDataType::Float32(_) => ScalarValue::Float32(None), + ConcreteDataType::Float64(_) => ScalarValue::Float64(None), + ConcreteDataType::Binary(_) => ScalarValue::LargeBinary(None), + ConcreteDataType::String(_) => ScalarValue::Utf8(None), + ConcreteDataType::Date(_) => ScalarValue::Date32(None), + ConcreteDataType::DateTime(_) => ScalarValue::Date64(None), + ConcreteDataType::Timestamp(t) => timestamp_to_scalar_value(t.unit(), None), + ConcreteDataType::List(_) => { + ScalarValue::List(None, Box::new(new_item_field(output_type.as_arrow_type()))) + } + } +} + +fn new_item_field(data_type: ArrowDataType) -> Field { + Field::new("item", data_type, false) +} + +fn timestamp_to_scalar_value(unit: TimeUnit, val: Option) -> ScalarValue { + match unit { + TimeUnit::Second => ScalarValue::TimestampSecond(val, None), + TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(val, None), + TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(val, None), + TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(val, None), + } } macro_rules! impl_ord_for_value_like { @@ -277,6 +366,9 @@ impl_value_from!(Float32, f32); impl_value_from!(Float64, f64); impl_value_from!(String, StringBytes); impl_value_from!(Binary, Bytes); +impl_value_from!(Date, Date); +impl_value_from!(DateTime, DateTime); +impl_value_from!(Timestamp, Timestamp); impl From for Value { fn from(string: String) -> Value { @@ -296,12 +388,6 @@ impl From> for Value { } } -impl From for Value { - fn from(v: Timestamp) -> Self { - Value::Timestamp(v) - } -} - impl From<&[u8]> for Value { fn from(bytes: &[u8]) -> Value { Value::Binary(bytes.into()) @@ -337,6 +423,7 @@ impl TryFrom for serde_json::Value { } } +// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. /// List value. 
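A usage sketch of the three helpers added above (`try_to_scalar_value`, `to_null_value`, `timestamp_to_scalar_value`), with illustrative values; list handling defers to the `ListValue::try_to_scalar_value` method on the struct defined below:

    use datafusion_common::ScalarValue;

    // A typed value converts to the matching ScalarValue variant...
    let ts = Value::Timestamp(Timestamp::new_millisecond(42));
    let scalar = ts
        .try_to_scalar_value(&ConcreteDataType::timestamp_millisecond_datatype())
        .unwrap();
    assert_eq!(ScalarValue::TimestampMillisecond(Some(42), None), scalar);

    // ...while Value::Null converts to a *typed* null via to_null_value, not
    // ScalarValue::Null, so the caller's requested output type is preserved.
    let null = Value::Null
        .try_to_scalar_value(&ConcreteDataType::int32_datatype())
        .unwrap();
    assert_eq!(ScalarValue::Int32(None), null);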
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ListValue { @@ -362,6 +449,24 @@ impl ListValue { pub fn datatype(&self) -> &ConcreteDataType { &self.datatype } + + fn try_to_scalar_value(&self, output_type: &ListType) -> Result { + let vs = if let Some(items) = self.items() { + Some( + items + .iter() + .map(|v| v.try_to_scalar_value(output_type.item_type())) + .collect::>>()?, + ) + } else { + None + }; + + Ok(ScalarValue::List( + vs, + Box::new(new_item_field(output_type.item_type().as_arrow_type())), + )) + } } impl Default for ListValue { @@ -391,6 +496,7 @@ impl TryFrom for Value { fn try_from(v: ScalarValue) -> Result { let v = match v { + ScalarValue::Null => Value::Null, ScalarValue::Boolean(b) => Value::from(b), ScalarValue::Float32(f) => Value::from(f), ScalarValue::Float64(f) => Value::from(f), @@ -405,8 +511,10 @@ impl TryFrom for Value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { Value::from(s.map(StringBytes::from)) } - ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, t) => { + ScalarValue::Binary(b) + | ScalarValue::LargeBinary(b) + | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), + ScalarValue::List(vs, field) => { let items = if let Some(vs) = vs { let vs = vs .into_iter() @@ -416,7 +524,7 @@ impl TryFrom for Value { } else { None }; - let datatype = t.as_ref().try_into()?; + let datatype = ConcreteDataType::try_from(field.data_type())?; Value::List(ListValue::new(items, datatype)) } ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), @@ -435,7 +543,13 @@ impl TryFrom for Value { ScalarValue::TimestampNanosecond(t, _) => t .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) .unwrap_or(Value::Null), - _ => { + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { return error::UnsupportedArrowTypeSnafu { arrow_type: v.get_datatype(), } @@ -545,15 +659,6 @@ impl<'a> Ord for ValueRef<'a> { } } -/// A helper trait to convert copyable types to `ValueRef`. -/// -/// It could replace the usage of `Into>`, thus avoid confusion between `Into` -/// and `Into>` in generic codes. One typical usage is the [`Primitive`](crate::primitive_traits::Primitive) trait. -pub trait IntoValueRef<'a> { - /// Convert itself to [ValueRef]. - fn into_value_ref(self) -> ValueRef<'a>; -} - macro_rules! impl_value_ref_from { ($Variant:ident, $Type:ident) => { impl From<$Type> for ValueRef<'_> { @@ -562,12 +667,6 @@ macro_rules! impl_value_ref_from { } } - impl<'a> IntoValueRef<'a> for $Type { - fn into_value_ref(self) -> ValueRef<'a> { - ValueRef::$Variant(self.into()) - } - } - impl From> for ValueRef<'_> { fn from(value: Option<$Type>) -> Self { match value { @@ -576,15 +675,6 @@ macro_rules! 
impl_value_ref_from { } } } - - impl<'a> IntoValueRef<'a> for Option<$Type> { - fn into_value_ref(self) -> ValueRef<'a> { - match self { - Some(v) => ValueRef::$Variant(v.into()), - None => ValueRef::Null, - } - } - } }; } @@ -599,6 +689,9 @@ impl_value_ref_from!(Int32, i32); impl_value_ref_from!(Int64, i64); impl_value_ref_from!(Float32, f32); impl_value_ref_from!(Float64, f64); +impl_value_ref_from!(Date, Date); +impl_value_ref_from!(DateTime, DateTime); +impl_value_ref_from!(Timestamp, Timestamp); impl<'a> From<&'a str> for ValueRef<'a> { fn from(string: &'a str) -> ValueRef<'a> { @@ -628,6 +721,7 @@ impl<'a> From>> for ValueRef<'a> { /// if it becomes bottleneck. #[derive(Debug, Clone, Copy)] pub enum ListValueRef<'a> { + // TODO(yingwen): Consider replace this by VectorRef. Indexed { vector: &'a ListVector, idx: usize }, Ref { val: &'a ListValue }, } @@ -785,19 +879,16 @@ mod tests { Some(Box::new(vec![Value::Int32(1), Value::Null])), ConcreteDataType::int32_datatype() )), - ScalarValue::List( - Some(Box::new(vec![ - ScalarValue::Int32(Some(1)), - ScalarValue::Int32(None) - ])), - Box::new(ArrowDataType::Int32) + ScalarValue::new_list( + Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), + ArrowDataType::Int32, ) .try_into() .unwrap() ); assert_eq!( Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::List(None, Box::new(ArrowDataType::UInt32)) + ScalarValue::new_list(None, ArrowDataType::UInt32) .try_into() .unwrap() ); @@ -980,6 +1071,10 @@ mod tests { ConcreteDataType::int32_datatype(), )), ); + check_type_and_value( + &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), + &Value::List(ListValue::default()), + ); check_type_and_value( &ConcreteDataType::date_datatype(), &Value::Date(Date::new(1)), @@ -989,8 +1084,8 @@ mod tests { &Value::DateTime(DateTime::new(1)), ); check_type_and_value( - &ConcreteDataType::timestamp_millis_datatype(), - &Value::Timestamp(Timestamp::from_millis(1)), + &ConcreteDataType::timestamp_millisecond_datatype(), + &Value::Timestamp(Timestamp::new_millisecond(1)), ); } @@ -1085,7 +1180,7 @@ mod tests { assert_eq!( serde_json::Value::Number(1.into()), - to_json(Value::Timestamp(Timestamp::from_millis(1))) + to_json(Value::Timestamp(Timestamp::new_millisecond(1))) ); let json_value: serde_json::Value = @@ -1143,7 +1238,7 @@ mod tests { check_as_value_ref!(Int64, -12); check_as_value_ref!(Float32, OrderedF32::from(16.0)); check_as_value_ref!(Float64, OrderedF64::from(16.0)); - check_as_value_ref!(Timestamp, Timestamp::from_millis(1)); + check_as_value_ref!(Timestamp, Timestamp::new_millisecond(1)); assert_eq!( ValueRef::String("hello"), @@ -1208,59 +1303,6 @@ mod tests { assert!(wrong_value.as_list().is_err()); } - #[test] - fn test_into_value_ref() { - macro_rules! check_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => { - let data: $PrimitiveType = $data; - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - data.into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(data) - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - Some(data).into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(Some(data)) - ); - let x: Option<$PrimitiveType> = None; - assert_eq!(ValueRef::Null, x.into_value_ref()); - assert_eq!(ValueRef::Null, x.into()); - }; - } - - macro_rules! 
check_primitive_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident) => { - check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType) - }; - } - - check_primitive_into_value_ref!(Boolean, true, bool); - check_primitive_into_value_ref!(UInt8, 10, u8); - check_primitive_into_value_ref!(UInt16, 20, u16); - check_primitive_into_value_ref!(UInt32, 30, u32); - check_primitive_into_value_ref!(UInt64, 40, u64); - check_primitive_into_value_ref!(Int8, -10, i8); - check_primitive_into_value_ref!(Int16, -20, i16); - check_primitive_into_value_ref!(Int32, -30, i32); - check_primitive_into_value_ref!(Int64, -40, i64); - check_into_value_ref!(Float32, 10.0, f32, OrderedF32); - check_into_value_ref!(Float64, 10.0, f64, OrderedF64); - - let hello = "hello"; - assert_eq!( - ValueRef::Binary(hello.as_bytes()), - ValueRef::from(hello.as_bytes()) - ); - assert_eq!(ValueRef::String(hello), ValueRef::from(hello)); - } - #[test] fn test_display() { assert_eq!(Value::Null.to_string(), "Null"); @@ -1301,10 +1343,248 @@ mod tests { assert_eq!( Value::List(ListValue::new( Some(Box::new(vec![])), - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond), + ConcreteDataType::timestamp_second_datatype(), )) .to_string(), - "Timestamp[]" + "TimestampSecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_millisecond_datatype(), + )) + .to_string(), + "TimestampMillisecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_microsecond_datatype(), + )) + .to_string(), + "TimestampMicrosecond[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_nanosecond_datatype(), + )) + .to_string(), + "TimestampNanosecond[]" + ); + } + + #[test] + fn test_not_null_value_to_scalar_value() { + assert_eq!( + ScalarValue::Boolean(Some(true)), + Value::Boolean(true) + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Boolean(Some(false)), + Value::Boolean(false) + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt8(Some(u8::MIN + 1)), + Value::UInt8(u8::MIN + 1) + .try_to_scalar_value(&ConcreteDataType::uint8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt16(Some(u16::MIN + 2)), + Value::UInt16(u16::MIN + 2) + .try_to_scalar_value(&ConcreteDataType::uint16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt32(Some(u32::MIN + 3)), + Value::UInt32(u32::MIN + 3) + .try_to_scalar_value(&ConcreteDataType::uint32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt64(Some(u64::MIN + 4)), + Value::UInt64(u64::MIN + 4) + .try_to_scalar_value(&ConcreteDataType::uint64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int8(Some(i8::MIN + 4)), + Value::Int8(i8::MIN + 4) + .try_to_scalar_value(&ConcreteDataType::int8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int16(Some(i16::MIN + 5)), + Value::Int16(i16::MIN + 5) + .try_to_scalar_value(&ConcreteDataType::int16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int32(Some(i32::MIN + 6)), + Value::Int32(i32::MIN + 6) + .try_to_scalar_value(&ConcreteDataType::int32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int64(Some(i64::MIN + 7)), + Value::Int64(i64::MIN + 7) + .try_to_scalar_value(&ConcreteDataType::int64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float32(Some(8.0f32)), + 
Value::Float32(OrderedFloat(8.0f32)) + .try_to_scalar_value(&ConcreteDataType::float32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float64(Some(9.0f64)), + Value::Float64(OrderedFloat(9.0f64)) + .try_to_scalar_value(&ConcreteDataType::float64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Utf8(Some("hello".to_string())), + Value::String(StringBytes::from("hello")) + .try_to_scalar_value(&ConcreteDataType::string_datatype(),) + .unwrap() + ); + assert_eq!( + ScalarValue::LargeBinary(Some("world".as_bytes().to_vec())), + Value::Binary(Bytes::from("world".as_bytes())) + .try_to_scalar_value(&ConcreteDataType::binary_datatype()) + .unwrap() + ); + } + + #[test] + fn test_null_value_to_scalar_value() { + assert_eq!( + ScalarValue::Boolean(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::boolean_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt16(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::UInt64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::uint64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int8_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int16(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int16_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Int64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::int64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float32(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::float32_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Float64(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::float64_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::Utf8(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::string_datatype()) + .unwrap() + ); + assert_eq!( + ScalarValue::LargeBinary(None), + Value::Null + .try_to_scalar_value(&ConcreteDataType::binary_datatype()) + .unwrap() + ); + } + + #[test] + fn test_list_value_to_scalar_value() { + let items = Some(Box::new(vec![Value::Int32(-1), Value::Null])); + let list = Value::List(ListValue::new(items, ConcreteDataType::int32_datatype())); + let df_list = list + .try_to_scalar_value(&ConcreteDataType::list_datatype( + ConcreteDataType::int32_datatype(), + )) + .unwrap(); + assert!(matches!(df_list, ScalarValue::List(_, _))); + match df_list { + ScalarValue::List(vs, field) => { + assert_eq!(ArrowDataType::Int32, *field.data_type()); + + let vs = vs.unwrap(); + assert_eq!( + vs, + vec![ScalarValue::Int32(Some(-1)), ScalarValue::Int32(None)] + ); + } + _ => unreachable!(), + } + } + + #[test] + fn test_timestamp_to_scalar_value() { + assert_eq!( + ScalarValue::TimestampSecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Second, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampMillisecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Millisecond, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampMicrosecond(Some(1), None), + 
timestamp_to_scalar_value(TimeUnit::Microsecond, Some(1)) + ); + assert_eq!( + ScalarValue::TimestampNanosecond(Some(1), None), + timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(1)) ); } } diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 6c9402849f..fe71a6a7c3 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -12,68 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod binary; -pub mod boolean; -mod builder; -pub mod constant; -pub mod date; -pub mod datetime; -mod eq; -mod helper; -mod list; -pub mod mutable; -pub mod null; -mod operations; -pub mod primitive; -mod string; -mod timestamp; - use std::any::Any; use std::fmt::Debug; use std::sync::Arc; use arrow::array::{Array, ArrayRef}; -use arrow::bitmap::Bitmap; -pub use binary::*; -pub use boolean::*; -pub use builder::VectorBuilder; -pub use constant::*; -pub use date::*; -pub use datetime::*; -pub use helper::Helper; -pub use list::*; -pub use mutable::MutableVector; -pub use null::*; -pub use operations::VectorOp; -pub use primitive::*; use snafu::ensure; -pub use string::*; -pub use timestamp::*; use crate::data_type::ConcreteDataType; use crate::error::{self, Result}; use crate::serialize::Serializable; use crate::value::{Value, ValueRef}; +use crate::vectors::operations::VectorOp; -#[derive(Debug, PartialEq)] -pub enum Validity<'a> { - /// Whether the array slot is valid or not (null). - Slots(&'a Bitmap), - /// All slots are valid. - AllValid, - /// All slots are null. - AllNull, -} +mod binary; +mod boolean; +mod constant; +mod date; +mod datetime; +mod eq; +mod helper; +mod list; +mod null; +mod operations; +mod primitive; +mod string; +mod timestamp; +mod validity; -impl<'a> Validity<'a> { - pub fn slots(&self) -> Option<&Bitmap> { - match self { - Validity::Slots(bitmap) => Some(bitmap), - _ => None, - } - } -} +pub use binary::{BinaryVector, BinaryVectorBuilder}; +pub use boolean::{BooleanVector, BooleanVectorBuilder}; +pub use constant::ConstantVector; +pub use date::{DateVector, DateVectorBuilder}; +pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; +pub use helper::Helper; +pub use list::{ListIter, ListVector, ListVectorBuilder}; +pub use null::{NullVector, NullVectorBuilder}; +pub use primitive::{ + Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, + Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, + Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, + UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, +}; +pub use string::{StringVector, StringVectorBuilder}; +pub use timestamp::{ + TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, +}; +pub use validity::Validity; +// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify +// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. /// Vector of data values. pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// Returns the data type of the vector. 
@@ -110,13 +101,7 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// The number of null slots on this [`Vector`]. /// # Implementation /// This is `O(1)`. - fn null_count(&self) -> usize { - match self.validity() { - Validity::Slots(bitmap) => bitmap.null_count(), - Validity::AllValid => 0, - Validity::AllNull => self.len(), - } - } + fn null_count(&self) -> usize; /// Returns true when it's a ConstantColumn fn is_const(&self) -> bool { @@ -165,6 +150,42 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { pub type VectorRef = Arc; +/// Mutable vector that could be used to build an immutable vector. +pub trait MutableVector: Send + Sync { + /// Returns the data type of the vector. + fn data_type(&self) -> ConcreteDataType; + + /// Returns the length of the vector. + fn len(&self) -> usize; + + /// Returns whether the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert to Any, to enable dynamic casting. + fn as_any(&self) -> &dyn Any; + + /// Convert to mutable Any, to enable dynamic casting. + fn as_mut_any(&mut self) -> &mut dyn Any; + + /// Convert `self` to an (immutable) [VectorRef] and reset `self`. + fn to_vector(&mut self) -> VectorRef; + + /// Push value ref to this mutable vector. + /// + /// Returns error if data types mismatch. + fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; + + /// Extend this mutable vector by slice of `vector`. + /// + /// Returns error if data types mismatch. + /// + /// # Panics + /// Panics if `offset + length > vector.len()`. + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; +} + /// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. macro_rules! impl_try_from_arrow_array_for_vector { ($Array: ident, $Vector: ident) => { @@ -172,16 +193,20 @@ macro_rules! impl_try_from_arrow_array_for_vector { pub fn try_from_arrow_array( array: impl AsRef, ) -> crate::error::Result<$Vector> { - Ok($Vector::from( - array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) + use snafu::OptionExt; + + let data = array + .as_ref() + .as_any() + .downcast_ref::<$Array>() + .with_context(|| crate::error::ConversionSnafu { + from: std::format!("{:?}", array.as_ref().data_type()), + })? + .data() + .clone(); + + let concrete_array = $Array::from(data); + Ok($Vector::from(concrete_array)) } } }; @@ -189,10 +214,7 @@ macro_rules! impl_try_from_arrow_array_for_vector { macro_rules! impl_validity_for_vector { ($array: expr) => { - match $array.validity() { - Some(bitmap) => Validity::Slots(bitmap), - None => Validity::AllValid, - } + Validity::from_array_data($array.data()) }; } @@ -219,10 +241,11 @@ macro_rules! impl_get_ref_for_vector { } macro_rules! impl_extend_for_builder { - ($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ + ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ use snafu::OptionExt; - let concrete_vector = $vector + let sliced_vector = $vector.slice($offset, $length); + let concrete_vector = sliced_vector .as_any() .downcast_ref::<$VectorType>() .with_context(|| crate::error::CastTypeSnafu { @@ -232,8 +255,9 @@ macro_rules! 
impl_extend_for_builder { stringify!($VectorType) ), })?; - let slice = concrete_vector.array.slice($offset, $length); - $mutable_array.extend_trusted_len(slice.iter()); + for value in concrete_vector.iter_data() { + $mutable_vector.push(value); + } Ok(()) }}; } @@ -245,27 +269,27 @@ pub(crate) use { #[cfg(test)] pub mod tests { - use arrow::array::{Array, PrimitiveArray}; + use arrow::array::{Array, Int32Array, UInt8Array}; use serde_json; - use super::helper::Helper; use super::*; use crate::data_type::DataType; - use crate::types::PrimitiveElement; + use crate::types::{Int32Type, LogicalPrimitiveType}; + use crate::vectors::helper::Helper; #[test] fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let vector = Helper::try_into_vector(df_column).unwrap(); assert_eq!( - i32::build_data_type().as_arrow_type(), + Int32Type::build_data_type().as_arrow_type(), vector.data_type().as_arrow_type() ); } #[test] fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() @@ -275,7 +299,7 @@ pub mod tests { #[test] fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8])); + let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index 817b29bca0..3b5defc8ec 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -15,9 +15,8 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BinaryValueIter, MutableArray}; -use arrow::bitmap::utils::ZipValidity; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; use crate::arrow_array::{BinaryArray, MutableBinaryArray}; use crate::data_type::ConcreteDataType; @@ -37,6 +36,16 @@ impl BinaryVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BinaryVector { + BinaryVector { + array: BinaryArray::from(data), + } + } } impl From for BinaryVector { @@ -48,7 +57,7 @@ impl From for BinaryVector { impl From>>> for BinaryVector { fn from(data: Vec>>) -> Self { Self { - array: BinaryArray::from(data), + array: BinaryArray::from_iter(data), } } } @@ -71,11 +80,13 @@ impl Vector for BinaryVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BinaryArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BinaryArray::from(data)) } fn validity(&self) -> Validity { @@ -83,7 +94,11 @@ impl Vector for BinaryVector { } fn memory_size(&self) -> usize { - self.array.values().len() + self.array.offsets().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -91,7 +106,8 @@ impl Vector for BinaryVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - 
Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -106,7 +122,7 @@ impl Vector for BinaryVector { impl ScalarVector for BinaryVector { type OwnedItem = Vec; type RefItem<'a> = &'a [u8]; - type Iter<'a> = ZipValidity<'a, &'a [u8], BinaryValueIter<'a, i64>>; + type Iter<'a> = ArrayIter<&'a BinaryArray>; type Builder = BinaryVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -148,12 +164,15 @@ impl MutableVector for BinaryVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_binary()?); + match value.as_binary()? { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) } } @@ -162,17 +181,20 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBinaryArray::with_capacity(capacity), + mutable_array: MutableBinaryArray::with_capacity(capacity, 0), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BinaryVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -205,14 +227,17 @@ mod tests { #[test] fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let v = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); assert_eq!(2, v.len()); assert_eq!("BinaryVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(30, v.memory_size()); + assert_eq!(128, v.memory_size()); for i in 0..2 { assert!(!v.is_null(i)); @@ -227,7 +252,10 @@ mod tests { #[test] fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); let json_value = vector.serialize_to_json().unwrap(); assert_eq!( @@ -253,8 +281,8 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = arrow_array.clone(); + let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); + let original = BinaryArray::from(arrow_array.data().clone()); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); } @@ -289,7 +317,7 @@ mod tests { builder.push(Some(b"world")); let vector = builder.finish(); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); let mut builder = BinaryVectorBuilder::with_capacity(3); builder.push(Some(b"hello")); @@ -298,9 +326,10 @@ mod tests { let vector = builder.finish(); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = 
validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert!(!validity.is_set(1)); + + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); } #[test] diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index 11c40bd661..2b4e5b8e10 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -16,9 +16,10 @@ use std::any::Any; use std::borrow::Borrow; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray}; -use arrow::bitmap::utils::{BitmapIter, ZipValidity}; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, +}; +use snafu::ResultExt; use crate::data_type::ConcreteDataType; use crate::error::Result; @@ -41,12 +42,26 @@ impl BooleanVector { pub(crate) fn as_boolean_array(&self) -> &BooleanArray { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BooleanVector { + BooleanVector { + array: BooleanArray::from(data), + } + } + + pub(crate) fn false_count(&self) -> usize { + self.array.false_count() + } } impl From> for BooleanVector { fn from(data: Vec) -> Self { BooleanVector { - array: BooleanArray::from_slice(&data), + array: BooleanArray::from(data), } } } @@ -91,11 +106,13 @@ impl Vector for BooleanVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BooleanArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BooleanArray::from(data)) } fn validity(&self) -> Validity { @@ -103,7 +120,11 @@ impl Vector for BooleanVector { } fn memory_size(&self) -> usize { - self.array.values().as_slice().0.len() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -111,7 +132,8 @@ impl Vector for BooleanVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -126,7 +148,7 @@ impl Vector for BooleanVector { impl ScalarVector for BooleanVector { type OwnedItem = bool; type RefItem<'a> = bool; - type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>; + type Iter<'a> = ArrayIter<&'a BooleanArray>; type Builder = BooleanVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -143,7 +165,7 @@ impl ScalarVector for BooleanVector { } pub struct BooleanVectorBuilder { - mutable_array: MutableBooleanArray, + mutable_array: BooleanBuilder, } impl MutableVector for BooleanVectorBuilder { @@ -168,12 +190,15 @@ impl MutableVector for BooleanVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_boolean()?); + match value.as_boolean()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) } } @@ -182,17 +207,20 @@ impl ScalarVectorBuilder for BooleanVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBooleanArray::with_capacity(capacity), + mutable_array: BooleanBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BooleanVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -225,9 +253,9 @@ mod tests { assert_eq!(9, v.len()); assert_eq!("BooleanVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(2, v.memory_size()); + assert_eq!(64, v.memory_size()); for (i, b) in bools.iter().enumerate() { assert!(!v.is_null(i)); @@ -316,13 +344,12 @@ mod tests { let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); let vector = BooleanVector::from(vec![true, false, false]); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); } #[test] diff --git a/src/datatypes/src/vectors/builder.rs b/src/datatypes/src/vectors/builder.rs deleted file mode 100644 index 67ab2513ab..0000000000 --- a/src/datatypes/src/vectors/builder.rs +++ /dev/null @@ -1,494 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
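The file deleted below is the arrow2-era `VectorBuilder`: a hand-written enum that dispatched `push`/`push_null`/`finish` over every concrete builder, so each new data type required a new variant. Its role is taken over by the `MutableVector` trait introduced in `vectors.rs` above, where each `DataType` hands out a boxed builder. A usage sketch of the replacement pattern, assuming `Int32Type` implements `Default` the way `DateType` does in the tests later in this patch:

    use datatypes::data_type::DataType;
    use datatypes::types::Int32Type;
    use datatypes::value::{Value, ValueRef};

    // Obtain a builder from the type object instead of matching on an enum.
    let mut builder = Int32Type::default().create_mutable_vector(16);
    builder.push_value_ref(ValueRef::Int32(5)).unwrap();
    builder.push_value_ref(ValueRef::Null).unwrap();
    let vector = builder.to_vector(); // VectorRef, i.e. Arc<dyn Vector>
    assert_eq!(Value::Int32(5), vector.get(0));
    assert_eq!(Value::Null, vector.get(1));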
- -use std::sync::Arc; - -use common_time::date::Date; -use common_time::datetime::DateTime; -use common_time::timestamp::Timestamp; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::ValueRef; -use crate::scalars::ScalarVectorBuilder; -use crate::value::Value; -use crate::vectors::date::DateVectorBuilder; -use crate::vectors::datetime::DateTimeVectorBuilder; -use crate::vectors::{ - BinaryVectorBuilder, BooleanVectorBuilder, Float32VectorBuilder, Float64VectorBuilder, - Int16VectorBuilder, Int32VectorBuilder, Int64VectorBuilder, Int8VectorBuilder, MutableVector, - NullVector, StringVectorBuilder, TimestampVectorBuilder, UInt16VectorBuilder, - UInt32VectorBuilder, UInt64VectorBuilder, UInt8VectorBuilder, VectorRef, -}; - -pub enum VectorBuilder { - Null(usize), - - // Numeric types: - Boolean(BooleanVectorBuilder), - UInt8(UInt8VectorBuilder), - UInt16(UInt16VectorBuilder), - UInt32(UInt32VectorBuilder), - UInt64(UInt64VectorBuilder), - Int8(Int8VectorBuilder), - Int16(Int16VectorBuilder), - Int32(Int32VectorBuilder), - Int64(Int64VectorBuilder), - Float32(Float32VectorBuilder), - Float64(Float64VectorBuilder), - - // String types: - String(StringVectorBuilder), - Binary(BinaryVectorBuilder), - - Date(DateVectorBuilder), - DateTime(DateTimeVectorBuilder), - Timestamp(TimestampVectorBuilder), -} - -impl VectorBuilder { - pub fn new(data_type: ConcreteDataType) -> VectorBuilder { - VectorBuilder::with_capacity(data_type, 0) - } - - pub fn with_capacity(data_type: ConcreteDataType, capacity: usize) -> VectorBuilder { - match data_type { - ConcreteDataType::Null(_) => VectorBuilder::Null(0), - ConcreteDataType::Boolean(_) => { - VectorBuilder::Boolean(BooleanVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt8(_) => { - VectorBuilder::UInt8(UInt8VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt16(_) => { - VectorBuilder::UInt16(UInt16VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt32(_) => { - VectorBuilder::UInt32(UInt32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::UInt64(_) => { - VectorBuilder::UInt64(UInt64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int8(_) => { - VectorBuilder::Int8(Int8VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int16(_) => { - VectorBuilder::Int16(Int16VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int32(_) => { - VectorBuilder::Int32(Int32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Int64(_) => { - VectorBuilder::Int64(Int64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Float32(_) => { - VectorBuilder::Float32(Float32VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Float64(_) => { - VectorBuilder::Float64(Float64VectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::String(_) => { - VectorBuilder::String(StringVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Binary(_) => { - VectorBuilder::Binary(BinaryVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Date(_) => { - VectorBuilder::Date(DateVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::DateTime(_) => { - VectorBuilder::DateTime(DateTimeVectorBuilder::with_capacity(capacity)) - } - ConcreteDataType::Timestamp(_) => { - VectorBuilder::Timestamp(TimestampVectorBuilder::with_capacity(capacity)) - } - _ => unimplemented!(), - } - } - - pub fn data_type(&self) -> ConcreteDataType { - match self { - VectorBuilder::Null(_) => 
ConcreteDataType::null_datatype(), - VectorBuilder::Boolean(b) => b.data_type(), - VectorBuilder::UInt8(b) => b.data_type(), - VectorBuilder::UInt16(b) => b.data_type(), - VectorBuilder::UInt32(b) => b.data_type(), - VectorBuilder::UInt64(b) => b.data_type(), - VectorBuilder::Int8(b) => b.data_type(), - VectorBuilder::Int16(b) => b.data_type(), - VectorBuilder::Int32(b) => b.data_type(), - VectorBuilder::Int64(b) => b.data_type(), - VectorBuilder::Float32(b) => b.data_type(), - VectorBuilder::Float64(b) => b.data_type(), - VectorBuilder::String(b) => b.data_type(), - VectorBuilder::Binary(b) => b.data_type(), - VectorBuilder::Date(b) => b.data_type(), - VectorBuilder::DateTime(b) => b.data_type(), - VectorBuilder::Timestamp(b) => b.data_type(), - } - } - - pub fn push(&mut self, value: &Value) { - if value.is_null() { - self.push_null(); - return; - } - - match (&mut *self, value) { - (VectorBuilder::Boolean(b), Value::Boolean(v)) => b.push(Some(*v)), - (VectorBuilder::UInt8(b), Value::UInt8(v)) => b.push(Some(*v)), - (VectorBuilder::UInt16(b), Value::UInt16(v)) => b.push(Some(*v)), - (VectorBuilder::UInt32(b), Value::UInt32(v)) => b.push(Some(*v)), - (VectorBuilder::UInt64(b), Value::UInt64(v)) => b.push(Some(*v)), - (VectorBuilder::Int8(b), Value::Int8(v)) => b.push(Some(*v)), - (VectorBuilder::Int16(b), Value::Int16(v)) => b.push(Some(*v)), - (VectorBuilder::Int32(b), Value::Int32(v)) => b.push(Some(*v)), - (VectorBuilder::Int64(b), Value::Int64(v)) => b.push(Some(*v)), - (VectorBuilder::Float32(b), Value::Float32(v)) => b.push(Some(v.into_inner())), - (VectorBuilder::Float64(b), Value::Float64(v)) => b.push(Some(v.into_inner())), - (VectorBuilder::String(b), Value::String(v)) => b.push(Some(v.as_utf8())), - (VectorBuilder::Binary(b), Value::Binary(v)) => b.push(Some(v)), - (VectorBuilder::Date(b), Value::Date(v)) => b.push(Some(*v)), - (VectorBuilder::Date(b), Value::Int32(v)) => b.push(Some(Date::new(*v))), - (VectorBuilder::DateTime(b), Value::DateTime(v)) => b.push(Some(*v)), - (VectorBuilder::DateTime(b), Value::Int64(v)) => b.push(Some(DateTime::new(*v))), - (VectorBuilder::Timestamp(b), Value::Timestamp(t)) => b.push(Some(*t)), - (VectorBuilder::Timestamp(b), Value::Int64(v)) => { - b.push(Some(Timestamp::from_millis(*v))) - } - - _ => panic!( - "Value {:?} does not match builder type {:?}", - value, - self.data_type() - ), - } - } - - pub fn try_push_ref(&mut self, value: ValueRef) -> Result<()> { - match &mut *self { - VectorBuilder::Null(b) => { - if !value.is_null() { - return error::CastTypeSnafu { - msg: "unable to accept non-null value in NullVectorBuilder", - } - .fail(); - } - *b += 1; - Ok(()) - } - VectorBuilder::Boolean(b) => b.push_value_ref(value), - VectorBuilder::UInt8(b) => b.push_value_ref(value), - VectorBuilder::UInt16(b) => b.push_value_ref(value), - VectorBuilder::UInt32(b) => b.push_value_ref(value), - VectorBuilder::UInt64(b) => b.push_value_ref(value), - VectorBuilder::Int8(b) => b.push_value_ref(value), - VectorBuilder::Int16(b) => b.push_value_ref(value), - VectorBuilder::Int32(b) => b.push_value_ref(value), - VectorBuilder::Int64(b) => b.push_value_ref(value), - VectorBuilder::Float32(b) => b.push_value_ref(value), - VectorBuilder::Float64(b) => b.push_value_ref(value), - VectorBuilder::String(b) => b.push_value_ref(value), - VectorBuilder::Binary(b) => b.push_value_ref(value), - VectorBuilder::Date(b) => b.push_value_ref(value), - VectorBuilder::DateTime(b) => b.push_value_ref(value), - VectorBuilder::Timestamp(b) => b.push_value_ref(value), - } - } - 
- pub fn push_null(&mut self) { - match self { - VectorBuilder::Null(v) => *v += 1, - VectorBuilder::Boolean(b) => b.push(None), - VectorBuilder::UInt8(b) => b.push(None), - VectorBuilder::UInt16(b) => b.push(None), - VectorBuilder::UInt32(b) => b.push(None), - VectorBuilder::UInt64(b) => b.push(None), - VectorBuilder::Int8(b) => b.push(None), - VectorBuilder::Int16(b) => b.push(None), - VectorBuilder::Int32(b) => b.push(None), - VectorBuilder::Int64(b) => b.push(None), - VectorBuilder::Float32(b) => b.push(None), - VectorBuilder::Float64(b) => b.push(None), - VectorBuilder::String(b) => b.push(None), - VectorBuilder::Binary(b) => b.push(None), - VectorBuilder::Date(b) => b.push(None), - VectorBuilder::DateTime(b) => b.push(None), - VectorBuilder::Timestamp(b) => b.push(None), - } - } - - pub fn finish(&mut self) -> VectorRef { - match self { - VectorBuilder::Null(v) => Arc::new(NullVector::new(*v)), - VectorBuilder::Boolean(b) => Arc::new(b.finish()), - VectorBuilder::UInt8(b) => Arc::new(b.finish()), - VectorBuilder::UInt16(b) => Arc::new(b.finish()), - VectorBuilder::UInt32(b) => Arc::new(b.finish()), - VectorBuilder::UInt64(b) => Arc::new(b.finish()), - VectorBuilder::Int8(b) => Arc::new(b.finish()), - VectorBuilder::Int16(b) => Arc::new(b.finish()), - VectorBuilder::Int32(b) => Arc::new(b.finish()), - VectorBuilder::Int64(b) => Arc::new(b.finish()), - VectorBuilder::Float32(b) => Arc::new(b.finish()), - VectorBuilder::Float64(b) => Arc::new(b.finish()), - VectorBuilder::String(b) => Arc::new(b.finish()), - VectorBuilder::Binary(b) => Arc::new(b.finish()), - VectorBuilder::Date(b) => Arc::new(b.finish()), - VectorBuilder::DateTime(b) => Arc::new(b.finish()), - VectorBuilder::Timestamp(b) => Arc::new(b.finish()), - } - } -} - -#[cfg(test)] -mod tests { - use ordered_float::OrderedFloat; - - use super::*; - use crate::prelude::Vector; - use crate::vectors::date::DateVector; - use crate::vectors::datetime::DateTimeVector; - - macro_rules! 
impl_integer_builder_test { - ($Type: ident, $datatype: ident) => { - let data_type = ConcreteDataType::$datatype(); - let mut builder = VectorBuilder::with_capacity(data_type.clone(), 10); - assert_eq!(data_type, builder.data_type()); - - for i in 0..10 { - builder.push(&Value::$Type(i)); - } - for i in 10..20 { - builder.try_push_ref(ValueRef::$Type(i)).unwrap(); - } - let vector = builder.finish(); - - for i in 0..20 { - assert_eq!(Value::$Type(i), vector.get(i as usize)); - } - - let mut builder = VectorBuilder::new(ConcreteDataType::$datatype()); - builder.push(&Value::Null); - builder.push(&Value::$Type(100)); - builder.try_push_ref(ValueRef::Null).unwrap(); - builder.try_push_ref(ValueRef::$Type(101)).unwrap(); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - format!( - "Failed to cast value Boolean(true) to primitive type {}", - stringify!($Type) - ), - ); - - let vector = builder.finish(); - - assert!(vector.is_null(0)); - assert_eq!(Value::$Type(100), vector.get(1)); - assert!(vector.is_null(2)); - assert_eq!(Value::$Type(101), vector.get(3)); - }; - } - - #[test] - fn test_null_vector_builder() { - let mut builder = VectorBuilder::new(ConcreteDataType::null_datatype()); - assert_eq!(ConcreteDataType::null_datatype(), builder.data_type()); - builder.push(&Value::Null); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "unable to accept non-null value in NullVectorBuilder" - ); - - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert!(vector.is_null(0)); - assert!(vector.is_null(1)); - } - - #[test] - fn test_integer_vector_builder() { - impl_integer_builder_test!(UInt8, uint8_datatype); - impl_integer_builder_test!(UInt16, uint16_datatype); - impl_integer_builder_test!(UInt32, uint32_datatype); - impl_integer_builder_test!(UInt64, uint64_datatype); - impl_integer_builder_test!(Int8, int8_datatype); - impl_integer_builder_test!(Int16, int16_datatype); - impl_integer_builder_test!(Int32, int32_datatype); - impl_integer_builder_test!(Int64, int64_datatype); - } - - #[test] - fn test_float_vector_builder() { - let data_type = ConcreteDataType::float32_datatype(); - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - - builder.push(&Value::Float32(OrderedFloat(1.0))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value Boolean(true) to primitive type Float32" - ); - - builder - .try_push_ref(ValueRef::Float32(OrderedFloat(2.0))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::Float32(OrderedFloat(1.0)), vector.get(0)); - assert_eq!(Value::Float32(OrderedFloat(2.0)), vector.get(1)); - assert_eq!(Value::Null, vector.get(2)); - - let mut builder = VectorBuilder::new(ConcreteDataType::float64_datatype()); - builder.push(&Value::Float64(OrderedFloat(2.0))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value Boolean(true) to primitive type Float64" - ); - - builder - .try_push_ref(ValueRef::Float64(OrderedFloat(3.0))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - 
assert_eq!(Value::Float64(OrderedFloat(2.0)), vector.get(0)); - assert_eq!(Value::Float64(OrderedFloat(3.0)), vector.get(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - fn test_binary_vector_builder() { - let data_type = ConcreteDataType::binary_datatype(); - let hello: &[u8] = b"hello"; - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - builder.push(&Value::Binary(hello.into())); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to Binary" - ); - - builder.try_push_ref(ValueRef::Binary(b"world")).unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::Binary(hello.into()), vector.get(0)); - assert_eq!(ValueRef::Binary(b"world"), vector.get_ref(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - fn test_string_vector_builder() { - let data_type = ConcreteDataType::string_datatype(); - let hello = "hello"; - let mut builder = VectorBuilder::new(data_type.clone()); - assert_eq!(data_type, builder.data_type()); - builder.push(&Value::String(hello.into())); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to String" - ); - - builder.try_push_ref(ValueRef::String("world")).unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let vector = builder.finish(); - assert_eq!(Value::String(hello.into()), vector.get(0)); - assert_eq!(ValueRef::String("world"), vector.get_ref(1)); - assert_eq!(Value::Null, vector.get(2)); - } - - #[test] - pub fn test_date_vector_builder() { - let mut builder = VectorBuilder::with_capacity(ConcreteDataType::date_datatype(), 3); - assert_eq!(ConcreteDataType::date_datatype(), builder.data_type()); - builder.push_null(); - builder.push(&Value::Date(Date::new(123))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to Date" - ); - - builder - .try_push_ref(ValueRef::Date(Date::new(456))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let v = builder.finish(); - let v = v.as_any().downcast_ref::().unwrap(); - assert_eq!(Value::Null, v.get(0)); - assert_eq!(Value::Date(Date::new(123)), v.get(1)); - assert_eq!(ValueRef::Date(Date::new(456)), v.get_ref(2)); - assert_eq!(ValueRef::Null, v.get_ref(3)); - assert_eq!( - &arrow::datatypes::DataType::Date32, - v.to_arrow_array().data_type() - ); - } - - #[test] - pub fn test_datetime_vector_builder() { - let mut builder = VectorBuilder::with_capacity(ConcreteDataType::datetime_datatype(), 3); - assert_eq!(ConcreteDataType::datetime_datatype(), builder.data_type()); - builder.push_null(); - builder.push(&Value::DateTime(DateTime::new(123))); - - let result = builder.try_push_ref(ValueRef::Boolean(true)); - assert!(result.is_err()); - assert_eq!( - result.unwrap_err().to_string(), - "Failed to cast value ref Boolean(true) to DateTime" - ); - - builder - .try_push_ref(ValueRef::DateTime(DateTime::new(456))) - .unwrap(); - builder.try_push_ref(ValueRef::Null).unwrap(); - - let v = builder.finish(); - let v = v.as_any().downcast_ref::().unwrap(); - assert_eq!(Value::Null, v.get(0)); - assert_eq!(Value::DateTime(DateTime::new(123)), v.get(1)); - 
assert_eq!(ValueRef::DateTime(DateTime::new(456)), v.get_ref(2)); - assert_eq!(ValueRef::Null, v.get_ref(3)); - assert_eq!( - &arrow::datatypes::DataType::Date64, - v.to_arrow_array().data_type() - ); - } -} diff --git a/src/datatypes/src/vectors/constant.rs b/src/datatypes/src/vectors/constant.rs index d5522007a1..87739e9131 100644 --- a/src/datatypes/src/vectors/constant.rs +++ b/src/datatypes/src/vectors/constant.rs @@ -55,6 +55,27 @@ impl ConstantVector { pub fn get_constant_ref(&self) -> ValueRef { self.vector.get_ref(0) } + + pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), self.len()); + + if offsets.is_empty() { + return self.slice(0, 0); + } + + Arc::new(ConstantVector::new( + self.vector.clone(), + *offsets.last().unwrap(), + )) + } + + pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { + let length = self.len() - filter.false_count(); + if length == self.len() { + return Ok(Arc::new(self.clone())); + } + Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) + } } impl Vector for ConstantVector { @@ -90,9 +111,9 @@ impl Vector for ConstantVector { fn validity(&self) -> Validity { if self.vector.is_null(0) { - Validity::AllNull + Validity::all_null(self.length) } else { - Validity::AllValid + Validity::all_valid(self.length) } } @@ -122,6 +143,14 @@ impl Vector for ConstantVector { fn get_ref(&self, _index: usize) -> ValueRef { self.vector.get_ref(0) } + + fn null_count(&self) -> usize { + if self.only_null() { + self.len() + } else { + 0 + } + } } impl fmt::Debug for ConstantVector { @@ -140,33 +169,6 @@ impl Serializable for ConstantVector { } } -pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.slice(0, 0); - } - - Arc::new(ConstantVector::new( - vector.vector.clone(), - *offsets.last().unwrap(), - )) -} - -pub(crate) fn filter_constant( - vector: &ConstantVector, - filter: &BooleanVector, -) -> Result { - let length = filter.len() - filter.as_boolean_array().values().null_count(); - if length == vector.len() { - return Ok(Arc::new(vector.clone())); - } - Ok(Arc::new(ConstantVector::new( - vector.inner().clone(), - length, - ))) -} - #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; @@ -182,9 +184,9 @@ mod tests { assert_eq!("ConstantVector", c.vector_type_name()); assert!(c.is_const()); assert_eq!(10, c.len()); - assert_eq!(Validity::AllValid, c.validity()); + assert!(c.validity().is_all_valid()); assert!(!c.only_null()); - assert_eq!(4, c.memory_size()); + assert_eq!(64, c.memory_size()); for i in 0..10 { assert!(!c.is_null(i)); diff --git a/src/datatypes/src/vectors/date.rs b/src/datatypes/src/vectors/date.rs index 0198b3622f..d0a66b80fb 100644 --- a/src/datatypes/src/vectors/date.rs +++ b/src/datatypes/src/vectors/date.rs @@ -12,258 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
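The rewrite below deletes the bespoke `DateVector` wrapper, along with its hand-rolled iterator, builder, and `replicate_date`, and replaces all of it with type aliases over the generic primitive vector, so `Date` semantics come entirely from `DateType`. The alias lines in the hunk have their generic parameters stripped; reconstructed with the obvious type argument, the change boils down to:

    // Vector of Date values; stored as i32 days since the UNIX epoch.
    pub type DateVector = PrimitiveVector<DateType>;
    // Builder for DateVector.
    pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>;

    // Callers now construct from raw i32 days but still read typed values,
    // as the updated tests below show:
    let v = DateVector::from_slice(&[1, 2]);
    assert_eq!(Some(Date::new(1)), v.get_data(0));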
-use std::any::Any; -use std::sync::Arc; +use crate::types::DateType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::date::Date; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::*; -use crate::scalars::ScalarVector; -use crate::serialize::Serializable; -use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateVector { - array: PrimitiveVector, -} - -impl DateVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::date_datatype() - } - - fn vector_type_name(&self) -> String { - "DateVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int32(v) => Value::Date(Date::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int32(v) => ValueRef::Date(Date::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl From>> for DateVector { - fn from(data: Vec>) -> Self { - Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateIter<'a> { - iter: PrimitiveIter<'a, i32>, -} - -impl<'a> Iterator for DateIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(Date::new)) - } -} - -impl ScalarVector for DateVector { - type OwnedItem = Date; - type RefItem<'a> = Date; - type Iter<'a> = DateIter<'a>; - - type Builder = DateVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(Date::new) - } - - fn iter_data(&self) -> Self::Iter<'_> { - DateIter { - iter: self.array.iter_data(), - } - } -} - -impl Serializable for DateVector { - fn serialize_to_json(&self) -> Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(Date::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } 
-}
-
-pub struct DateVectorBuilder {
-    buffer: PrimitiveVectorBuilder<i32>,
-}
-
-impl MutableVector for DateVectorBuilder {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::date_datatype()
-    }
-
-    fn len(&self) -> usize {
-        self.buffer.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        self.buffer.push(value.as_date()?.map(|d| d.val()));
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        let concrete_vector = vector
-            .as_any()
-            .downcast_ref::<DateVector>()
-            .with_context(|| error::CastTypeSnafu {
-                msg: format!(
-                    "Failed to convert vector from {} to DateVector",
-                    vector.vector_type_name()
-                ),
-            })?;
-        self.buffer
-            .extend_slice_of(&concrete_vector.array, offset, length)?;
-        Ok(())
-    }
-}
-
-impl ScalarVectorBuilder for DateVectorBuilder {
-    type VectorType = DateVector;
-
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            buffer: PrimitiveVectorBuilder::with_capacity(capacity),
-        }
-    }
-
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        self.buffer.push(value.map(|d| d.val()))
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        Self::VectorType {
-            array: self.buffer.finish(),
-        }
-    }
-}
-
-pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef {
-    let array = crate::vectors::primitive::replicate_primitive_with_type(
-        &vector.array,
-        offsets,
-        vector.data_type(),
-    );
-    Arc::new(DateVector { array })
-}
+/// Vector for [`Date`](common_time::Date).
+pub type DateVector = PrimitiveVector<DateType>;
+/// Builder for [`DateVector`].
+pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>;

 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::Array;
+    use common_time::date::Date;
+
     use super::*;
     use crate::data_type::DataType;
+    use crate::scalars::{ScalarVector, ScalarVectorBuilder};
+    use crate::serialize::Serializable;
     use crate::types::DateType;
+    use crate::value::{Value, ValueRef};
+    use crate::vectors::{Vector, VectorRef};

     #[test]
     fn test_build_date_vector() {
@@ -288,7 +58,7 @@ mod tests {

     #[test]
     fn test_date_scalar() {
-        let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
+        let vector = DateVector::from_slice(&[1, 2]);
         assert_eq!(2, vector.len());
         assert_eq!(Some(Date::new(1)), vector.get_data(0));
         assert_eq!(Some(Date::new(2)), vector.get_data(1));
@@ -296,7 +66,7 @@ mod tests {

     #[test]
     fn test_date_vector_builder() {
-        let input = DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]);
+        let input = DateVector::from_slice(&[1, 2, 3]);

         let mut builder = DateType::default().create_mutable_vector(3);
         builder
@@ -309,19 +79,25 @@ mod tests {
             .is_err());

         let vector = builder.to_vector();
-        let expect: VectorRef = Arc::new(DateVector::from_slice(&[
-            Date::new(5),
-            Date::new(2),
-            Date::new(3),
-        ]));
+        let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3]));
         assert_eq!(expect, vector);
     }

     #[test]
     fn test_date_from_arrow() {
-        let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
+        let vector = DateVector::from_slice(&[1, 2]);
         let arrow = vector.as_arrow().slice(0, vector.len());
         let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap();
         assert_eq!(vector, vector2);
     }
+
+    #[test]
+    fn test_serialize_date_vector() {
+        let vector = DateVector::from_slice(&[-1, 0, 1]);
+        let serialized_json =
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!( + r#"["1969-12-31","1970-01-01","1970-01-02"]"#, + serialized_json + ); + } } diff --git a/src/datatypes/src/vectors/datetime.rs b/src/datatypes/src/vectors/datetime.rs index 732e56004c..a40a3e54d3 100644 --- a/src/datatypes/src/vectors/datetime.rs +++ b/src/datatypes/src/vectors/datetime.rs @@ -12,264 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; -use std::sync::Arc; +use crate::types::DateTimeType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::datetime::DateTime; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::{ - MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, -}; -use crate::serialize::Serializable; -use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateTimeVector { - array: PrimitiveVector, -} - -impl DateTimeVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateTimeVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn vector_type_name(&self) -> String { - "DateTimeVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int64(v) => Value::DateTime(DateTime::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int64(v) => ValueRef::DateTime(DateTime::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl Serializable for DateTimeVector { - fn serialize_to_json(&self) -> crate::Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(DateTime::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -impl From>> for DateTimeVector { - fn from(data: Vec>) -> Self { - 
Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateTimeVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl ScalarVectorBuilder for DateTimeVectorBuilder { - type VectorType = DateTimeVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value.map(|d| d.val())) - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -impl MutableVector for DateTimeVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_datetime()?.map(|d| d.val())); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -pub struct DateTimeIter<'a> { - iter: PrimitiveIter<'a, i64>, -} - -impl<'a> Iterator for DateTimeIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(DateTime::new)) - } -} - -impl ScalarVector for DateTimeVector { - type OwnedItem = DateTime; - type RefItem<'a> = DateTime; - type Iter<'a> = DateTimeIter<'a>; - type Builder = DateTimeVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(DateTime::new) - } - - fn iter_data(&self) -> Self::Iter<'_> { - DateTimeIter { - iter: self.array.iter_data(), - } - } -} - -pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(DateTimeVector { array }) -} +/// Vector of [`DateTime`](common_time::Date) +pub type DateTimeVector = PrimitiveVector; +/// Builder for [`DateTimeVector`]. 
+pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::sync::Arc; + + use arrow::array::{Array, PrimitiveArray}; + use common_time::DateTime; + use datafusion_common::from_slice::FromSlice; use super::*; use crate::data_type::DataType; - use crate::types::DateTimeType; + use crate::prelude::{ + ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, + }; + use crate::serialize::Serializable; #[test] fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3])); + let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); assert_eq!(3, v.len()); assert_eq!("DateTimeVector", v.vector_type_name()); @@ -287,9 +55,8 @@ mod tests { assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); assert!(!v.is_null(0)); - assert_eq!(24, v.memory_size()); // size of i64 * 3 + assert_eq!(64, v.memory_size()); - assert_matches!(v.validity(), Validity::AllValid); if let Value::DateTime(d) = v.get(0) { assert_eq!(1, d.val()); } else { @@ -314,8 +81,11 @@ mod tests { assert_eq!(Value::Null, v.get(1)); assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - let input = - DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]); + let input = DateTimeVector::from_wrapper_slice(&[ + DateTime::new(1), + DateTime::new(2), + DateTime::new(3), + ]); let mut builder = DateTimeType::default().create_mutable_vector(3); builder @@ -328,7 +98,7 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[ + let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ DateTime::new(5), DateTime::new(2), DateTime::new(3), @@ -338,7 +108,7 @@ mod tests { #[test] fn test_datetime_from_arrow() { - let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]); + let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); diff --git a/src/datatypes/src/vectors/eq.rs b/src/datatypes/src/vectors/eq.rs index d47167c3f9..55359026d4 100644 --- a/src/datatypes/src/vectors/eq.rs +++ b/src/datatypes/src/vectors/eq.rs @@ -15,9 +15,12 @@ use std::sync::Arc; use crate::data_type::DataType; +use crate::types::TimestampType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - PrimitiveVector, StringVector, TimestampVector, Vector, + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, + StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, Vector, }; use crate::with_match_primitive_type_id; @@ -76,7 +79,20 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { String(_) => is_vector_eq!(StringVector, lhs, rhs), Date(_) => is_vector_eq!(DateVector, lhs, rhs), DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), - Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs), + Timestamp(t) => match t { + TimestampType::Second(_) => { + is_vector_eq!(TimestampSecondVector, lhs, rhs) + } + TimestampType::Millisecond(_) => { + 
is_vector_eq!(TimestampMillisecondVector, lhs, rhs) + } + TimestampType::Microsecond(_) => { + is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) + } + TimestampType::Nanosecond(_) => { + is_vector_eq!(TimestampNanosecondVector, lhs, rhs) + } + }, List(_) => is_vector_eq!(ListVector, lhs, rhs), UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) | Float32(_) | Float64(_) => { @@ -95,13 +111,10 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { #[cfg(test)] mod tests { - use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; - use super::*; use crate::vectors::{ - Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, - NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, - VectorRef, + list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, }; fn assert_vector_ref_eq(vector: VectorRef) { @@ -132,14 +145,21 @@ mod tests { assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); - let mut arrow_array = MutableListArray::>::new(); - arrow_array - .try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])]) - .unwrap(); - let arrow_array: ListArray = arrow_array.into(); - assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array))); + let list_vector = list::tests::new_list_vector(&[ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(4)]), + ]); + assert_vector_ref_eq(Arc::new(list_vector)); assert_vector_ref_eq(Arc::new(NullVector::new(4))); assert_vector_ref_eq(Arc::new(StringVector::from(vec![ diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 60a9f8511f..f3236ca0ec 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -17,19 +17,26 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::Array; +use arrow::array::{Array, ArrayRef, StringArray}; use arrow::compute; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::compute::kernels::comparison; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use datafusion_common::ScalarValue; use snafu::{OptionExt, ResultExt}; -use crate::arrow_array::StringArray; -use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu}; -use crate::scalars::*; -use crate::vectors::date::DateVector; -use crate::vectors::datetime::DateTimeVector; -use crate::vectors::*; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarVectorBuilder}; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::{ + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, + Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, + ListVectorBuilder, MutableVector, NullVector, StringVector, 
TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, +}; +/// Helper functions for `Vector`. pub struct Helper; impl Helper { @@ -47,7 +54,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -61,7 +68,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -78,7 +85,7 @@ impl Helper { let arr = vector .as_mut_any() .downcast_mut() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", ty, @@ -94,7 +101,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -105,11 +112,9 @@ impl Helper { } /// Try to cast an arrow scalar value into vector - /// - /// # Panics - /// Panic if given scalar value is not supported. pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { let vector = match value { + ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), ScalarValue::Boolean(v) => { ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) } @@ -143,17 +148,29 @@ impl Helper { ScalarValue::UInt64(v) => { ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) } - ScalarValue::Utf8(v) => { + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) } - ScalarValue::LargeUtf8(v) => { - ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - } - ScalarValue::Binary(v) => { + ScalarValue::Binary(v) + | ScalarValue::LargeBinary(v) + | ScalarValue::FixedSizeBinary(_, v) => { ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) } - ScalarValue::LargeBinary(v) => { - ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) + ScalarValue::List(v, field) => { + let item_type = ConcreteDataType::try_from(field.data_type())?; + let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); + if let Some(values) = v { + let values = values + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + let list_value = ListValue::new(Some(Box::new(values)), item_type); + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + let list_vector = builder.to_vector(); + ConstantVector::new(list_vector, length) } ScalarValue::Date32(v) => { ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) @@ -161,8 +178,30 @@ impl Helper { ScalarValue::Date64(v) => { ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) } - _ => { - return ConversionSnafu { + ScalarValue::TimestampSecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMillisecond(v, _) => { + // Timezone is unimplemented now. 
+ ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMicrosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) + } + ScalarValue::TimestampNanosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) + } + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::ConversionSnafu { from: format!("Unsupported scalar value: {}", value), } .fail() @@ -180,9 +219,7 @@ impl Helper { Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Arc::new(BinaryVector::try_from_arrow_array(array)?) - } + ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), @@ -193,48 +230,80 @@ impl Helper { ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Arc::new(StringVector::try_from_arrow_array(array)?) - } + ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), - ArrowDataType::Timestamp(_, _) => { - Arc::new(TimestampVector::try_from_arrow_array(array)?) + ArrowDataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), + TimeUnit::Millisecond => { + Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Microsecond => { + Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Nanosecond => { + Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) + } + }, + ArrowDataType::Float16 + | ArrowDataType::Time32(_) + | ArrowDataType::Time64(_) + | ArrowDataType::Duration(_) + | ArrowDataType::Interval(_) + | ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeUtf8 + | ArrowDataType::LargeList(_) + | ArrowDataType::FixedSizeList(_, _) + | ArrowDataType::Struct(_) + | ArrowDataType::Union(_, _, _) + | ArrowDataType::Dictionary(_, _) + | ArrowDataType::Decimal128(_, _) + | ArrowDataType::Decimal256(_, _) + | ArrowDataType::Map(_, _) => { + unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) } - _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), }) } + /// Try to cast slice of `arrays` to vectors. 
     pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result<Vec<VectorRef>> {
         arrays.iter().map(Self::try_into_vector).collect()
     }

+    /// Performs the SQL `LIKE` operation on `names` with the scalar pattern `s`.
     pub fn like_utf8(names: Vec<String>, s: &str) -> Result<VectorRef> {
-        let array = StringArray::from_slice(&names);
+        let array = StringArray::from(names);

-        let filter =
-            compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
+        let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;

-        let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
+        let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
         Helper::try_into_vector(result)
     }
 }

 #[cfg(test)]
 mod tests {
-    use arrow::array::Int32Array;
-    use common_time::date::Date;
-    use common_time::datetime::DateTime;
+    use arrow::array::{
+        ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
+        Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray,
+        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+        TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+    };
+    use arrow::datatypes::{Field, Int32Type};
+    use common_time::{Date, DateTime};

     use super::*;
+    use crate::value::Value;
+    use crate::vectors::ConcreteDataType;

     #[test]
     fn test_try_into_vectors() {
         let arrays: Vec<ArrayRef> = vec![
-            Arc::new(Int32Array::from_vec(vec![1])),
-            Arc::new(Int32Array::from_vec(vec![2])),
-            Arc::new(Int32Array::from_vec(vec![3])),
+            Arc::new(Int32Array::from(vec![1])),
+            Arc::new(Int32Array::from(vec![2])),
+            Arc::new(Int32Array::from(vec![3])),
         ];
         let vectors = Helper::try_into_vectors(&arrays);
         assert!(vectors.is_ok());
@@ -246,10 +315,10 @@
     }

     #[test]
-    pub fn test_try_into_date_vector() {
+    fn test_try_into_date_vector() {
         let vector = DateVector::from(vec![Some(1), Some(2), None]);
         let arrow_array = vector.to_arrow_array();
-        assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type());
+        assert_eq!(&ArrowDataType::Date32, arrow_array.data_type());
         let vector_converted = Helper::try_into_vector(arrow_array).unwrap();
         assert_eq!(vector.len(), vector_converted.len());
         for i in 0..vector_converted.len() {
@@ -258,7 +327,7 @@
     }

     #[test]
-    pub fn test_try_from_scalar_date_value() {
+    fn test_try_from_scalar_date_value() {
         let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap();
         assert_eq!(ConcreteDataType::date_datatype(), vector.data_type());
         assert_eq!(3, vector.len());
@@ -268,7 +337,7 @@
     }

     #[test]
-    pub fn test_try_from_scalar_datetime_value() {
+    fn test_try_from_scalar_datetime_value() {
         let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap();
         assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type());
         assert_eq!(3, vector.len());
@@ -277,6 +346,28 @@
         }
     }

+    #[test]
+    fn test_try_from_list_value() {
+        let value = ScalarValue::List(
+            Some(vec![
+                ScalarValue::Int32(Some(1)),
+                ScalarValue::Int32(Some(2)),
+            ]),
+            Box::new(Field::new("item", ArrowDataType::Int32, true)),
+        );
+        let vector = Helper::try_from_scalar_value(value, 3).unwrap();
+        assert_eq!(
+            ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()),
+            vector.data_type()
+        );
+        assert_eq!(3, vector.len());
+        for i in 0..vector.len() {
+            let v = vector.get(i);
+            let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap();
+            assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items);
+        }
+    }
+
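The `like_utf8` rewrite above swaps arrow2's `compute::like`/`compute::filter` modules for the official crate's comparison and filter kernels. A condensed sketch of the flow, using only the calls that appear in the hunk (error handling through the crate's snafu contexts is kept as-is):

```rust
// Hedged sketch of Helper::like_utf8 after this patch: build a boolean
// mask with the LIKE kernel, filter the array with it, then convert the
// surviving rows back into a vector.
let array = StringArray::from(vec!["greptime".to_string(), "hello".to_string()]);
let mask = comparison::like_utf8_scalar(&array, "%e%").context(error::ArrowComputeSnafu)?;
let filtered = compute::filter(&array, &mask).context(error::ArrowComputeSnafu)?;
let vector = Helper::try_into_vector(filtered)?;
assert_eq!(2, vector.len()); // both "greptime" and "hello" contain an 'e'
```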
#[test] fn test_like_utf8() { fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { @@ -301,4 +392,40 @@ mod tests { let ret = Helper::like_utf8(names, "%").unwrap(); assert_vector(vec!["greptime", "hello", "public", "world"], &ret); } + + fn check_try_into_vector(array: impl Array + 'static) { + let array: ArrayRef = Arc::new(array); + let vector = Helper::try_into_vector(array.clone()).unwrap(); + assert_eq!(&array, &vector.to_arrow_array()); + } + + #[test] + fn test_try_into_vector() { + check_try_into_vector(NullArray::new(2)); + check_try_into_vector(BooleanArray::from(vec![true, false])); + check_try_into_vector(LargeBinaryArray::from(vec![ + "hello".as_bytes(), + "world".as_bytes(), + ])); + check_try_into_vector(Int8Array::from(vec![1, 2, 3])); + check_try_into_vector(Int16Array::from(vec![1, 2, 3])); + check_try_into_vector(Int32Array::from(vec![1, 2, 3])); + check_try_into_vector(Int64Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); + check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(StringArray::from(vec!["hello", "world"])); + check_try_into_vector(Date32Array::from(vec![1, 2, 3])); + check_try_into_vector(Date64Array::from(vec![1, 2, 3])); + let data = vec![None, Some(vec![Some(6), Some(7)])]; + let list_array = ListArray::from_iter_primitive::(data); + check_try_into_vector(list_array); + check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); + } } diff --git a/src/datatypes/src/vectors/list.rs b/src/datatypes/src/vectors/list.rs index 76d9dd8717..747e03557b 100644 --- a/src/datatypes/src/vectors/list.rs +++ b/src/datatypes/src/vectors/list.rs @@ -13,39 +13,48 @@ // limitations under the License. use std::any::Any; -use std::ops::Range; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, ListArray}; -use arrow::bitmap::utils::ZipValidity; -use arrow::bitmap::MutableBitmap; +use arrow::array::{ + Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, +}; +use arrow::buffer::Buffer; use arrow::datatypes::DataType as ArrowDataType; use serde_json::Value as JsonValue; -use snafu::prelude::*; +use crate::data_type::{ConcreteDataType, DataType}; use crate::error::Result; -use crate::prelude::*; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::ListType; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector}; - -type ArrowListArray = ListArray; +use crate::value::{ListValue, ListValueRef, Value, ValueRef}; +use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; /// Vector of Lists, basically backed by Arrow's `ListArray`. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, PartialEq)] pub struct ListVector { - array: ArrowListArray, - inner_datatype: ConcreteDataType, + array: ListArray, + /// The datatype of the items in the list. + item_type: ConcreteDataType, } impl ListVector { - /// Only iterate values in the [ListVector]. 
- /// - /// Be careful to use this method as it would ignore validity and replace null - /// by empty vector. - pub fn values_iter(&self) -> Box> + '_> { - Box::new(self.array.values_iter().map(VectorHelper::try_into_vector)) + /// Iterate elements as [VectorRef]. + pub fn values_iter(&self) -> impl Iterator>> + '_ { + self.array + .iter() + .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose()) + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self { + Self { + array: ListArray::from(data), + item_type, + } } pub(crate) fn as_arrow(&self) -> &dyn Array { @@ -55,7 +64,7 @@ impl ListVector { impl Vector for ListVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(self.inner_datatype.clone())) + ConcreteDataType::List(ListType::new(self.item_type.clone())) } fn vector_type_name(&self) -> String { @@ -71,21 +80,25 @@ impl Vector for ListVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(ListArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(ListArray::from(data)) } fn validity(&self) -> Validity { - impl_validity_for_vector!(self.array) + vectors::impl_validity_for_vector!(self.array) } fn memory_size(&self) -> usize { - let offsets_bytes = self.array.offsets().len() * std::mem::size_of::(); - let value_refs_bytes = self.array.values().len() * std::mem::size_of::>(); - offsets_bytes + value_refs_bytes + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -93,7 +106,8 @@ impl Vector for ListVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(ListVector::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data_and_type(data, self.item_type.clone())) } fn get(&self, index: usize) -> Value { @@ -102,7 +116,7 @@ impl Vector for ListVector { } let array = &self.array.value(index); - let vector = VectorHelper::try_into_vector(array).unwrap_or_else(|_| { + let vector = Helper::try_into_vector(array).unwrap_or_else(|_| { panic!( "arrow array with datatype {:?} cannot converted to our vector", array.data_type() @@ -113,7 +127,7 @@ impl Vector for ListVector { .collect::>(); Value::List(ListValue::new( Some(Box::new(values)), - self.inner_datatype.clone(), + self.item_type.clone(), )) } @@ -131,7 +145,7 @@ impl Serializable for ListVector { .iter() .map(|v| match v { None => Ok(JsonValue::Null), - Some(v) => VectorHelper::try_into_vector(v) + Some(v) => Helper::try_into_vector(v) .and_then(|v| v.serialize_to_json()) .map(JsonValue::Array), }) @@ -139,70 +153,64 @@ impl Serializable for ListVector { } } -impl From for ListVector { - fn from(array: ArrowListArray) -> Self { - let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() { - ArrowDataType::List(field) => &field.data_type, - _ => unreachable!(), +impl From for ListVector { + fn from(array: ListArray) -> Self { + let item_type = ConcreteDataType::from_arrow_type(match array.data_type() { + ArrowDataType::List(field) => field.data_type(), + other => panic!( + "Try to create ListVector from an arrow array with type {:?}", + other + ), }); - Self { - array, - inner_datatype, - } + Self { array, item_type } } } 
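Note how `to_arrow_array`, `to_boxed_arrow_array`, and `slice` above all round-trip through `ArrayData` instead of cloning the `ListArray` directly. A small sketch of that pattern under the same arrow version's API; the helper names here are illustrative only:

```rust
use arrow::array::{Array, ListArray};

// Rebuild a ListArray from its underlying ArrayData. The buffers inside
// ArrayData are reference-counted, so this copies handles, not elements.
fn rebuild(array: &ListArray) -> ListArray {
    ListArray::from(array.data().clone())
}

// Zero-copy slicing goes through the same ArrayData path.
fn slice_list(array: &ListArray, offset: usize, len: usize) -> ListArray {
    ListArray::from(array.data().slice(offset, len))
}
```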
-impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector); +vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector); -pub struct ListVectorIter<'a> { +pub struct ListIter<'a> { vector: &'a ListVector, - iter: ZipValidity<'a, usize, Range>, + idx: usize, } -impl<'a> ListVectorIter<'a> { - pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> { - let iter = ZipValidity::new( - 0..vector.len(), - vector.array.validity().as_ref().map(|x| x.iter()), - ); - - Self { vector, iter } +impl<'a> ListIter<'a> { + fn new(vector: &'a ListVector) -> ListIter { + ListIter { vector, idx: 0 } } } -impl<'a> Iterator for ListVectorIter<'a> { +impl<'a> Iterator for ListIter<'a> { type Item = Option>; #[inline] fn next(&mut self) -> Option { - self.iter.next().map(|idx_opt| { - idx_opt.map(|idx| ListValueRef::Indexed { - vector: self.vector, - idx, - }) - }) + if self.idx >= self.vector.len() { + return None; + } + + let idx = self.idx; + self.idx += 1; + + if self.vector.is_null(idx) { + return Some(None); + } + + Some(Some(ListValueRef::Indexed { + vector: self.vector, + idx, + })) } #[inline] fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } - - #[inline] - fn nth(&mut self, n: usize) -> Option { - self.iter.nth(n).map(|idx_opt| { - idx_opt.map(|idx| ListValueRef::Indexed { - vector: self.vector, - idx, - }) - }) + (self.vector.len(), Some(self.vector.len())) } } impl ScalarVector for ListVector { type OwnedItem = ListValue; type RefItem<'a> = ListValueRef<'a>; - type Iter<'a> = ListVectorIter<'a>; + type Iter<'a> = ListIter<'a>; type Builder = ListVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -214,86 +222,68 @@ impl ScalarVector for ListVector { } fn iter_data(&self) -> Self::Iter<'_> { - ListVectorIter::new(self) + ListIter::new(self) } } -// Some codes are ported from arrow2's MutableListArray. +// Ports from arrow's GenericListBuilder. +// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs +/// [ListVector] builder. pub struct ListVectorBuilder { - inner_type: ConcreteDataType, - offsets: Vec, - values: Box, - validity: Option, + item_type: ConcreteDataType, + offsets_builder: Int32BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: Box, } impl ListVectorBuilder { - pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); - // The actual required capacity might greater than the capacity of the `ListVector` - // if there exists child vector that has more than one element. - let values = inner_type.create_mutable_vector(capacity); + /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity` + /// is the number of items to pre-allocate space for in this builder. + pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { + let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); + offsets_builder.append(0); + // The actual required capacity might be greater than the capacity of the `ListVector` + // if the child vector has more than one element. 
+ let values_builder = item_type.create_mutable_vector(capacity); ListVectorBuilder { - inner_type, - offsets, - values, - validity: None, + item_type, + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, } } - #[inline] - fn last_offset(&self) -> i32 { - *self.offsets.last().unwrap() + /// Finish the current variable-length list vector slot. + fn finish_list(&mut self, is_valid: bool) { + self.offsets_builder + .append(i32::try_from(self.values_builder.len()).unwrap()); + self.null_buffer_builder.append(is_valid); } fn push_null(&mut self) { - self.offsets.push(self.last_offset()); - match &mut self.validity { - Some(validity) => validity.push(false), - None => self.init_validity(), - } - } - - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; - - let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); - validity.extend_constant(len, true); - validity.set(len - 1, false); - self.validity = Some(validity) + self.finish_list(false); } fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { if let Some(items) = list_value.items() { for item in &**items { - self.values.push_value_ref(item.as_value_ref())?; + self.values_builder.push_value_ref(item.as_value_ref())?; } } - self.push_valid(); + + self.finish_list(true); Ok(()) } - - /// Needs to be called when a valid value was extended to this builder. - fn push_valid(&mut self) { - let size = self.values.len(); - let size = i32::try_from(size).unwrap(); - assert!(size >= *self.offsets.last().unwrap()); - - self.offsets.push(size); - if let Some(validity) = &mut self.validity { - validity.push(true) - } - } } impl MutableVector for ListVectorBuilder { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::list_datatype(self.inner_type.clone()) + ConcreteDataType::list_datatype(self.item_type.clone()) } fn len(&self) -> usize { - self.offsets.len() - 1 + self.null_buffer_builder.len() } fn as_any(&self) -> &dyn Any { @@ -348,51 +338,181 @@ impl ScalarVectorBuilder for ListVectorBuilder { self.push_value_ref(value.into()).unwrap_or_else(|e| { panic!( "Failed to push value, expect value type {:?}, err:{}", - self.inner_type, e + self.item_type, e ); }); } fn finish(&mut self) -> Self::VectorType { - let array = ArrowListArray::try_new( - ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(), - std::mem::take(&mut self.offsets).into(), - self.values.to_vector().to_arrow_array(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - .unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array. + let len = self.len(); + let values_vector = self.values_builder.to_vector(); + let values_arr = values_vector.to_arrow_array(); + let values_data = values_arr.data(); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); + // Re-initialize the offsets_builder. 
+ self.offsets_builder.append(0); + let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); + let array_data_builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data_builder.build_unchecked() }; + let array = ListArray::from(array_data); ListVector { array, - inner_datatype: self.inner_type.clone(), + item_type: self.item_type.clone(), + } + } +} + +// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs +/// Builder for creating the null bit buffer. +/// This builder only materializes the buffer when we append `false`. +/// If you only append `true`s to the builder, what you get will be +/// `None` when calling [`finish`](#method.finish). +/// This optimization is **very** important for the performance. +#[derive(Debug)] +struct NullBufferBuilder { + bitmap_builder: Option, + /// Store the length of the buffer before materializing. + len: usize, + capacity: usize, +} + +impl NullBufferBuilder { + /// Creates a new empty builder. + /// `capacity` is the number of bits in the null buffer. + fn new(capacity: usize) -> Self { + Self { + bitmap_builder: None, + len: 0, + capacity, + } + } + + fn len(&self) -> usize { + if let Some(b) = &self.bitmap_builder { + b.len() + } else { + self.len + } + } + + /// Appends a `true` into the builder + /// to indicate that this item is not null. + #[inline] + fn append_non_null(&mut self) { + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append(true) + } else { + self.len += 1; + } + } + + /// Appends a `false` into the builder + /// to indicate that this item is null. + #[inline] + fn append_null(&mut self) { + self.materialize_if_needed(); + self.bitmap_builder.as_mut().unwrap().append(false); + } + + /// Appends a boolean value into the builder. + #[inline] + fn append(&mut self, not_null: bool) { + if not_null { + self.append_non_null() + } else { + self.append_null() + } + } + + /// Builds the null buffer and resets the builder. + /// Returns `None` if the builder only contains `true`s. 
+ fn finish(&mut self) -> Option { + let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); + self.bitmap_builder = None; + self.len = 0; + buf + } + + #[inline] + fn materialize_if_needed(&mut self) { + if self.bitmap_builder.is_none() { + self.materialize() + } + } + + #[cold] + fn materialize(&mut self) { + if self.bitmap_builder.is_none() { + let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); + b.append_n(self.len, true); + self.bitmap_builder = Some(b); } } } #[cfg(test)] -mod tests { - use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend}; +pub mod tests { + use arrow::array::{Int32Array, Int32Builder, ListBuilder}; use serde_json::json; use super::*; + use crate::scalars::ScalarRef; use crate::types::ListType; + use crate::vectors::Int32Vector; + + pub fn new_list_vector(data: &[Option>>]) -> ListVector { + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); + for vec_opt in data { + if let Some(vec) = vec_opt { + let values = vec.iter().map(|v| Value::from(*v)).collect(); + let values = Some(Box::new(values)); + let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); + + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + } + + builder.finish() + } + + fn new_list_array(data: &[Option>>]) -> ListArray { + let mut builder = ListBuilder::new(Int32Builder::new()); + for vec_opt in data { + if let Some(vec) = vec_opt { + for value_opt in vec { + builder.values().append_option(*value_opt); + } + + builder.append(true); + } else { + builder.append(false); + } + } + + builder.finish() + } #[test] fn test_list_vector() { let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector { - array: arrow_array.clone(), - inner_datatype: ConcreteDataType::int32_datatype(), - }; assert_eq!( ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() @@ -403,30 +523,34 @@ mod tests { assert!(list_vector.is_null(1)); assert!(!list_vector.is_null(2)); + let arrow_array = new_list_array(&data); assert_eq!( arrow_array, - list_vector + *list_vector .to_arrow_array() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() - .clone() - ); - assert_eq!( - Validity::Slots(arrow_array.validity().unwrap()), - list_vector.validity() - ); - assert_eq!( - arrow_array.offsets().len() * std::mem::size_of::() - + arrow_array.values().len() * std::mem::size_of::>(), - list_vector.memory_size() ); + let validity = list_vector.validity(); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert!(validity.is_set(0)); + assert!(!validity.is_set(1)); + assert!(validity.is_set(2)); + assert_eq!(256, list_vector.memory_size()); - let slice = list_vector.slice(0, 2); + let slice = list_vector.slice(0, 2).to_arrow_array(); + let sliced_array = slice.as_any().downcast_ref::().unwrap(); assert_eq!( - "ListArray[[1, 2, 3], None]", - format!("{:?}", slice.to_arrow_array()) + Int32Array::from_iter_values([1, 2, 3]), + *sliced_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() ); + assert!(sliced_array.is_null(1)); assert_eq!( Value::List(ListValue::new( @@ -467,52 +591,48 @@ mod tests { #[test] fn 
test_from_arrow_array() { let data = vec![ - Some(vec![Some(1u32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let arrow_array = new_list_array(&data); let array_ref: ArrayRef = Arc::new(arrow_array); + let expect = new_list_vector(&data); + // Test try from ArrayRef let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!( - "ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }", - format!("{:?}", list_vector) - ); + assert_eq!(expect, list_vector); + + // Test from + let arrow_array = new_list_array(&data); + let list_vector = ListVector::from(arrow_array); + assert_eq!(expect, list_vector); } #[test] fn test_iter_list_vector_values() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector::from(arrow_array); assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())), + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() ); let mut iter = list_vector.values_iter(); assert_eq!( - "Int64[1, 2, 3]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap() ); + assert!(iter.next().unwrap().unwrap().is_none()); assert_eq!( - "Int64[]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) - ); - assert_eq!( - "Int64[4, None, 6]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap(), ); assert!(iter.next().is_none()) } @@ -520,30 +640,18 @@ mod tests { #[test] fn test_serialize_to_json() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - let list_vector = ListVector::from(arrow_array); + let list_vector = new_list_vector(&data); assert_eq!( vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], list_vector.serialize_to_json().unwrap() ); } - fn new_list_vector(data: Vec>>>) -> ListVector { - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - ListVector::from(arrow_array) - } - #[test] fn test_list_vector_builder() { let mut builder = @@ -567,14 +675,14 @@ mod tests { None, Some(vec![Some(7), Some(8), None]), ]; - let input = new_list_vector(data); + let input = new_list_vector(&data); builder.extend_slice_of(&input, 1, 2).unwrap(); assert!(builder .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(new_list_vector(vec![ + let expect: VectorRef = Arc::new(new_list_vector(&[ Some(vec![Some(4), None, Some(6)]), None, Some(vec![Some(7), 
Some(8), None]), @@ -599,7 +707,7 @@ mod tests { })); let vector = builder.finish(); - let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]); + let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); assert_eq!(expect, vector); assert!(vector.get_data(0).is_none()); diff --git a/src/datatypes/src/vectors/mutable.rs b/src/datatypes/src/vectors/mutable.rs deleted file mode 100644 index 5f94957460..0000000000 --- a/src/datatypes/src/vectors/mutable.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use crate::error::Result; -use crate::prelude::*; - -/// Mutable vector that could be used to build an immutable vector. -pub trait MutableVector: Send + Sync { - /// Returns the data type of the vector. - fn data_type(&self) -> ConcreteDataType; - - /// Returns the length of the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert to Any, to enable dynamic casting. - fn as_any(&self) -> &dyn Any; - - /// Convert to mutable Any, to enable dynamic casting. - fn as_mut_any(&mut self) -> &mut dyn Any; - - /// Convert `self` to an (immutable) [VectorRef] and reset `self`. - fn to_vector(&mut self) -> VectorRef; - - /// Push value ref to this mutable vector. - /// - /// Returns error if data type unmatch. - fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; - - /// Extend this mutable vector by slice of `vector`. - /// - /// Returns error if data type unmatch. - /// - /// # Panics - /// Panics if `offset + length > vector.len()`. - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; -} diff --git a/src/datatypes/src/vectors/null.rs b/src/datatypes/src/vectors/null.rs index 64974d99b0..bb66e09b39 100644 --- a/src/datatypes/src/vectors/null.rs +++ b/src/datatypes/src/vectors/null.rs @@ -16,8 +16,7 @@ use std::any::Any; use std::fmt; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, NullArray}; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; use snafu::{ensure, OptionExt}; use crate::data_type::ConcreteDataType; @@ -27,21 +26,28 @@ use crate::types::NullType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; +/// A vector where all elements are nulls. #[derive(PartialEq)] pub struct NullVector { array: NullArray, } +// TODO(yingwen): Support null vector with other logical types. impl NullVector { + /// Create a new `NullVector` with `n` elements. 
pub fn new(n: usize) -> Self { Self { - array: NullArray::new(ArrowDataType::Null, n), + array: NullArray::new(n), } } pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } } impl From for NullVector { @@ -68,21 +74,28 @@ impl Vector for NullVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. + let data = self.to_array_data(); + Arc::new(NullArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(NullArray::from(data)) } fn validity(&self) -> Validity { - Validity::AllNull + Validity::all_null(self.array.len()) } fn memory_size(&self) -> usize { 0 } + fn null_count(&self) -> usize { + self.array.null_count() + } + fn is_null(&self, _row: usize) -> bool { true } @@ -217,7 +230,7 @@ mod tests { assert_eq!("NullVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllNull, v.validity()); + assert!(v.validity().is_all_null()); assert!(v.only_null()); for i in 0..32 { @@ -246,7 +259,7 @@ mod tests { #[test] fn test_null_vector_validity() { let vector = NullVector::new(5); - assert_eq!(Validity::AllNull, vector.validity()); + assert!(vector.validity().is_all_null()); assert_eq!(5, vector.null_count()); } diff --git a/src/datatypes/src/vectors/operations.rs b/src/datatypes/src/vectors/operations.rs index e63f338a05..70ddb4a031 100644 --- a/src/datatypes/src/vectors/operations.rs +++ b/src/datatypes/src/vectors/operations.rs @@ -19,10 +19,11 @@ mod replicate; use common_base::BitVec; use crate::error::Result; -use crate::types::PrimitiveElement; +use crate::types::LogicalPrimitiveType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef, + BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, + VectorRef, }; /// Vector compute operations. @@ -59,10 +60,10 @@ pub trait VectorOp { } macro_rules! impl_scalar_vector_op { - ($( { $VectorType: ident, $replicate: ident } ),+) => {$( + ($($VectorType: ident),+) => {$( impl VectorOp for $VectorType { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::$replicate(self, offsets) + replicate::replicate_scalar(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { @@ -77,28 +78,21 @@ macro_rules! 
impl_scalar_vector_op { )+}; } -impl_scalar_vector_op!( - { BinaryVector, replicate_scalar }, - { BooleanVector, replicate_scalar }, - { ListVector, replicate_scalar }, - { StringVector, replicate_scalar }, - { DateVector, replicate_date }, - { DateTimeVector, replicate_datetime }, - { TimestampVector, replicate_timestamp } -); +impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); -impl VectorOp for ConstantVector { +impl VectorOp for PrimitiveVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_constant(self, offsets) + std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); + let prev_vector = + prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); + find_unique::find_unique_scalar(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_constant(self, filter) + filter::filter_non_constant!(self, PrimitiveVector, filter) } } @@ -117,21 +111,17 @@ impl VectorOp for NullVector { } } -impl VectorOp for PrimitiveVector -where - T: PrimitiveElement, -{ +impl VectorOp for ConstantVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_primitive(self, offsets) + self.replicate_vector(offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_constant(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) + self.filter_vector(filter) } } diff --git a/src/datatypes/src/vectors/operations/filter.rs b/src/datatypes/src/vectors/operations/filter.rs index 7a9f514a16..8368a6afb4 100644 --- a/src/datatypes/src/vectors/operations/filter.rs +++ b/src/datatypes/src/vectors/operations/filter.rs @@ -12,16 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) use crate::vectors::constant::filter_constant; - macro_rules! 
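(A usage sketch for the reshuffled `VectorOp` impls above, not part of the patch; constructors and re-export paths are assumed from this patch's own tests.)

    use datatypes::vectors::{BooleanVector, Int32Vector, Vector, VectorOp};

    let v = Int32Vector::from_slice(&[1, 2, 3]);
    // `offsets` are cumulative end positions: element i is emitted
    // offsets[i] - offsets[i - 1] times, so this yields [1, 1, 2, 3, 3].
    let replicated = v.replicate(&[2, 3, 5]);
    assert_eq!(5, replicated.len());
    // Filtering keeps rows whose predicate bit is true: [1, 3].
    let filtered = v
        .filter(&BooleanVector::from(vec![true, false, true]))
        .unwrap();
    assert_eq!(2, filtered.len());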
filter_non_constant { ($vector: expr, $VectorType: ty, $filter: ident) => {{ use std::sync::Arc; + use arrow::compute; use snafu::ResultExt; let arrow_array = $vector.as_arrow(); - let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array()) + let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) .context(crate::error::ArrowComputeSnafu)?; Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) }}; @@ -33,9 +32,16 @@ pub(crate) use filter_non_constant; mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use crate::scalars::ScalarVector; + use crate::timestamp::{ + TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, + }; + use crate::types::WrapperType; + use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, + BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, }; fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { @@ -105,7 +111,6 @@ mod tests { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ use std::sync::Arc; - use common_time::$ValueType; use $crate::vectors::{$VectorType, VectorRef}; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -123,6 +128,18 @@ mod tests { fn test_filter_date_like() { impl_filter_date_like_test!(DateVector, Date, new); impl_filter_date_like_test!(DateTimeVector, DateTime, new); - impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis); + + impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); + impl_filter_date_like_test!( + TimestampMillisecondVector, + TimestampMillisecond, + from_native + ); + impl_filter_date_like_test!( + TimestampMicrosecondVector, + TimestampMicrosecond, + from_native + ); + impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); } } diff --git a/src/datatypes/src/vectors/operations/find_unique.rs b/src/datatypes/src/vectors/operations/find_unique.rs index d63a3c66b9..7116a9e90d 100644 --- a/src/datatypes/src/vectors/operations/find_unique.rs +++ b/src/datatypes/src/vectors/operations/find_unique.rs @@ -15,7 +15,8 @@ use common_base::BitVec; use crate::scalars::ScalarVector; -use crate::vectors::{ConstantVector, NullVector, Vector}; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{NullVector, Vector}; // To implement `find_unique()` correctly, we need to keep in mind that always marks an element as // selected when it is different from the previous one, and leaves the `selected` unchanged @@ -70,7 +71,7 @@ pub(crate) fn find_unique_null( return; } - let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true); + let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); if is_first_not_duplicate { selected.set(0, true); } @@ -104,8 +105,11 @@ pub(crate) fn find_unique_constant( mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use super::*; - use crate::vectors::{Int32Vector, StringVector, VectorOp}; + use crate::timestamp::*; + use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; fn check_bitmap(expect: &[bool], selected: &BitVec) { let actual = selected.iter().collect::>(); @@ -121,7 +125,7 @@ mod tests { input: impl Iterator>, prev: Option<&[i32]>, ) { - let input = Int32Vector::from_iter(input); + let input = Int32Vector::from(input.collect::>()); let prev = 
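(For context: the macro now delegates to the official crate's `arrow::compute::filter`, which takes a `&dyn Array` plus a `BooleanArray` and returns a freshly allocated `ArrayRef`. A minimal standalone sketch:)

    use arrow::array::{Array, BooleanArray, Int32Array};
    use arrow::compute;

    let values = Int32Array::from(vec![1, 2, 3]);
    let predicate = BooleanArray::from(vec![true, false, true]);
    // Rows with a `true` bit survive: [1, 3].
    let filtered = compute::filter(&values, &predicate).unwrap();
    assert_eq!(2, filtered.len());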
prev.map(Int32Vector::from_slice); let mut selected = BitVec::repeat(false, input.len()); @@ -341,7 +345,6 @@ mod tests { macro_rules! impl_find_unique_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); @@ -356,6 +359,9 @@ mod tests { fn test_find_unique_date_like() { impl_find_unique_date_like_test!(DateVector, Date, new); impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis); + impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); + impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); + impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); + impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); } } diff --git a/src/datatypes/src/vectors/operations/replicate.rs b/src/datatypes/src/vectors/operations/replicate.rs index 7fb93134ed..8216517fc6 100644 --- a/src/datatypes/src/vectors/operations/replicate.rs +++ b/src/datatypes/src/vectors/operations/replicate.rs @@ -13,12 +13,8 @@ // limitations under the License. use crate::prelude::*; -pub(crate) use crate::vectors::constant::replicate_constant; -pub(crate) use crate::vectors::date::replicate_date; -pub(crate) use crate::vectors::datetime::replicate_datetime; pub(crate) use crate::vectors::null::replicate_null; pub(crate) use crate::vectors::primitive::replicate_primitive; -pub(crate) use crate::vectors::timestamp::replicate_timestamp; pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { assert_eq!(offsets.len(), c.len()); @@ -43,8 +39,13 @@ pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> Vec mod tests { use std::sync::Arc; + use common_time::timestamp::TimeUnit; + use common_time::{Date, DateTime, Timestamp}; + use paste::paste; + use super::*; - use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp}; + use crate::vectors::constant::ConstantVector; + use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; #[test] fn test_replicate_primitive() { @@ -120,7 +121,6 @@ mod tests { macro_rules! impl_replicate_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -138,10 +138,33 @@ mod tests { }}; } + macro_rules! 
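(Sketch of the `find_unique` contract described in the comment above: it only ever sets bits, assumes sorted input, and marks the first element of each run of equal values. Not part of the patch; paths assumed.)

    use common_base::BitVec;
    use datatypes::vectors::{Int32Vector, Vector, VectorOp};

    let v = Int32Vector::from_slice(&[5, 5, 6, 6, 7]);
    let mut selected = BitVec::repeat(false, v.len());
    v.find_unique(&mut selected, None);
    // First occurrence of each run is marked; duplicates stay unmarked.
    assert!(selected[0] && selected[2] && selected[4]);
    assert!(!selected[1] && !selected[3]);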
impl_replicate_timestamp_test { + ($unit: ident) => {{ + paste!{ + use $crate::vectors::[<Timestamp $unit Vector>]; + use $crate::timestamp::[<Timestamp $unit>]; + let v = [<Timestamp $unit Vector>]::from_iterator((0..5).map([<Timestamp $unit>]::from)); + let offsets = [0, 1, 2, 3, 4]; + let v = v.replicate(&offsets); + assert_eq!(4, v.len()); + for i in 0..4 { + assert_eq!( + Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)), + v.get(i) + ); + } + } + }}; + } + #[test] fn test_replicate_date_like() { impl_replicate_date_like_test!(DateVector, Date, new); impl_replicate_date_like_test!(DateTimeVector, DateTime, new); - impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis); + + impl_replicate_timestamp_test!(Second); + impl_replicate_timestamp_test!(Millisecond); + impl_replicate_timestamp_test!(Microsecond); + impl_replicate_timestamp_test!(Nanosecond); } } diff --git a/src/datatypes/src/vectors/primitive.rs b/src/datatypes/src/vectors/primitive.rs index c49295630c..7829c31731 100644 --- a/src/datatypes/src/vectors/primitive.rs +++ b/src/datatypes/src/vectors/primitive.rs @@ -13,75 +13,111 @@ // limitations under the License. use std::any::Any; -use std::iter::FromIterator; -use std::slice::Iter; +use std::fmt; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray}; -use arrow::bitmap::utils::ZipValidity; +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder, +}; use serde_json::Value as JsonValue; -use snafu::{OptionExt, ResultExt}; +use snafu::OptionExt; -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{ConversionSnafu, Result, SerializeSnafu}; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; -use crate::types::{Primitive, PrimitiveElement}; +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, +}; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; +pub type UInt8Vector = PrimitiveVector<UInt8Type>; +pub type UInt16Vector = PrimitiveVector<UInt16Type>; +pub type UInt32Vector = PrimitiveVector<UInt32Type>; +pub type UInt64Vector = PrimitiveVector<UInt64Type>; + +pub type Int8Vector = PrimitiveVector<Int8Type>; +pub type Int16Vector = PrimitiveVector<Int16Type>; +pub type Int32Vector = PrimitiveVector<Int32Type>; +pub type Int64Vector = PrimitiveVector<Int64Type>; + +pub type Float32Vector = PrimitiveVector<Float32Type>; +pub type Float64Vector = PrimitiveVector<Float64Type>; + /// Vector for primitive data types. -#[derive(Debug, Clone, PartialEq)] -pub struct PrimitiveVector<T: Primitive> { - pub(crate) array: PrimitiveArray<T>, +pub struct PrimitiveVector<T: LogicalPrimitiveType> { + array: PrimitiveArray<T::ArrowPrimitive>, } -impl<T: Primitive> PrimitiveVector<T> { - pub fn new(array: PrimitiveArray<T>) -> Self { +impl<T: LogicalPrimitiveType> PrimitiveVector<T> { + pub fn new(array: PrimitiveArray<T::ArrowPrimitive>) -> Self { Self { array } } pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::<PrimitiveArray<T>>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) + let data = array + .as_ref() + .as_any() + .downcast_ref::<PrimitiveArray<T::ArrowPrimitive>>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })?
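(For readers unfamiliar with `paste`, the `[<Timestamp $unit Vector>]` segments above glue tokens into one identifier. Instantiated with `Second`, the macro body expands to roughly the following; crate paths are assumed:)

    use datatypes::scalars::ScalarVector;
    use datatypes::timestamp::TimestampSecond;
    use datatypes::vectors::{TimestampSecondVector, Vector, VectorOp};

    let v = TimestampSecondVector::from_iterator((0..5).map(TimestampSecond::from));
    // offsets[0] == 0, so element 0 is dropped; the rest appear once each.
    let v = v.replicate(&[0, 1, 2, 3, 4]);
    assert_eq!(4, v.len());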
+ .data() + .clone(); + let concrete_array = PrimitiveArray::::from(data); + Ok(Self::new(concrete_array)) } - pub fn from_slice>(slice: P) -> Self { + pub fn from_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied(); Self { - array: PrimitiveArray::from_slice(slice), + array: PrimitiveArray::from_iter_values(iter), } } - pub fn from_vec(array: Vec) -> Self { + pub fn from_wrapper_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); Self { - array: PrimitiveArray::from_vec(array), + array: PrimitiveArray::from_iter_values(iter), } } - pub fn from_values>(iter: I) -> Self { + pub fn from_vec(array: Vec) -> Self { Self { - array: PrimitiveArray::from_values(iter), + array: PrimitiveArray::from_iter_values(array), } } - pub(crate) fn as_arrow(&self) -> &dyn Array { + pub fn from_values>(iter: I) -> Self { + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub(crate) fn as_arrow(&self) -> &PrimitiveArray { &self.array } - fn slice(&self, offset: usize, length: usize) -> Self { - Self::from(self.array.slice(offset, length)) + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: PrimitiveArray::from(data), + } + } + + // To distinguish with `Vector::slice()`. + fn get_slice(&self, offset: usize, length: usize) -> Self { + let data = self.array.data().slice(offset, length); + Self::from_array_data(data) } } -impl Vector for PrimitiveVector { +impl Vector for PrimitiveVector { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -99,11 +135,13 @@ impl Vector for PrimitiveVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(PrimitiveArray::::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(PrimitiveArray::::from(data)) } fn validity(&self) -> Validity { @@ -111,7 +149,11 @@ impl Vector for PrimitiveVector { } fn memory_size(&self) -> usize { - self.array.values().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -119,57 +161,80 @@ impl Vector for PrimitiveVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(self.slice(offset, length)) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) + if self.array.is_valid(index) { + // Safety: The index have been checked by `is_valid()`. + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() + } else { + Value::Null + } } fn get_ref(&self, index: usize) -> ValueRef { if self.array.is_valid(index) { // Safety: The index have been checked by `is_valid()`. 
- unsafe { self.array.value_unchecked(index).into_value_ref() } + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() } else { ValueRef::Null } } } -impl From> for PrimitiveVector { - fn from(array: PrimitiveArray) -> Self { +impl fmt::Debug for PrimitiveVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("PrimitiveVector") + .field("array", &self.array) + .finish() + } +} + +impl From> for PrimitiveVector { + fn from(array: PrimitiveArray) -> Self { Self { array } } } -impl From>> for PrimitiveVector { - fn from(v: Vec>) -> Self { +impl From>> for PrimitiveVector { + fn from(v: Vec>) -> Self { Self { - array: PrimitiveArray::::from(v), + array: PrimitiveArray::from_iter(v), } } } -impl>> FromIterator for PrimitiveVector { - fn from_iter>(iter: I) -> Self { - Self { - array: MutablePrimitiveArray::::from_iter(iter).into(), - } +pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { + iter: ArrayIter<&'a PrimitiveArray>, +} + +impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { + type Item = Option; + + fn next(&mut self) -> Option> { + self.iter + .next() + .map(|item| item.map(T::Wrapper::from_native)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() } } -impl ScalarVector for PrimitiveVector -where - T: PrimitiveElement, -{ - type OwnedItem = T; - type RefItem<'a> = T; +impl ScalarVector for PrimitiveVector { + type OwnedItem = T::Wrapper; + type RefItem<'a> = T::Wrapper; type Iter<'a> = PrimitiveIter<'a, T>; type Builder = PrimitiveVectorBuilder; fn get_data(&self, idx: usize) -> Option> { if self.array.is_valid(idx) { - Some(self.array.value(idx)) + Some(T::Wrapper::from_native(self.array.value(idx))) } else { None } @@ -182,59 +247,47 @@ where } } -pub type UInt8Vector = PrimitiveVector; -pub type UInt16Vector = PrimitiveVector; -pub type UInt32Vector = PrimitiveVector; -pub type UInt64Vector = PrimitiveVector; - -pub type Int8Vector = PrimitiveVector; -pub type Int16Vector = PrimitiveVector; -pub type Int32Vector = PrimitiveVector; -pub type Int64Vector = PrimitiveVector; - -pub type Float32Vector = PrimitiveVector; -pub type Float64Vector = PrimitiveVector; - -pub struct PrimitiveIter<'a, T> { - iter: ZipValidity<'a, &'a T, Iter<'a, T>>, -} - -impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - fn next(&mut self) -> Option> { - self.iter.next().map(|v| v.copied()) - } -} - -impl Serializable for PrimitiveVector { +impl Serializable for PrimitiveVector { fn serialize_to_json(&self) -> Result> { - self.array - .iter() - .map(serde_json::to_value) - .collect::>() - .context(SerializeSnafu) + let res = self + .iter_data() + .map(|v| match v { + None => serde_json::Value::Null, + // use WrapperType's Into bound instead of + // serde_json::to_value to facilitate customized serialization + // for WrapperType + Some(v) => v.into(), + }) + .collect::>(); + Ok(res) } } -pub struct PrimitiveVectorBuilder { - pub(crate) mutable_array: MutablePrimitiveArray, +impl PartialEq for PrimitiveVector { + fn eq(&self, other: &PrimitiveVector) -> bool { + self.array == other.array + } } -pub type UInt8VectorBuilder = PrimitiveVectorBuilder; -pub type UInt16VectorBuilder = PrimitiveVectorBuilder; -pub type UInt32VectorBuilder = PrimitiveVectorBuilder; -pub type UInt64VectorBuilder = PrimitiveVectorBuilder; +pub type UInt8VectorBuilder = PrimitiveVectorBuilder; +pub type UInt16VectorBuilder = PrimitiveVectorBuilder; +pub type UInt32VectorBuilder = 
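(The net effect of the `WrapperType` plumbing above: scalar access yields logical wrapper values rather than raw arrow natives, and for plain numeric types the wrapper is the native type itself. A sketch, paths assumed:)

    use datatypes::scalars::ScalarVector;
    use datatypes::vectors::Int32Vector;

    let v = Int32Vector::from(vec![Some(1), None, Some(3)]);
    // Nulls surface as `None`; valid slots as `Some(wrapper)`.
    assert_eq!(None, v.get_data(1));
    assert_eq!(
        vec![Some(1), None, Some(3)],
        v.iter_data().collect::<Vec<_>>()
    );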
PrimitiveVectorBuilder; +pub type UInt64VectorBuilder = PrimitiveVectorBuilder; -pub type Int8VectorBuilder = PrimitiveVectorBuilder; -pub type Int16VectorBuilder = PrimitiveVectorBuilder; -pub type Int32VectorBuilder = PrimitiveVectorBuilder; -pub type Int64VectorBuilder = PrimitiveVectorBuilder; +pub type Int8VectorBuilder = PrimitiveVectorBuilder; +pub type Int16VectorBuilder = PrimitiveVectorBuilder; +pub type Int32VectorBuilder = PrimitiveVectorBuilder; +pub type Int64VectorBuilder = PrimitiveVectorBuilder; -pub type Float32VectorBuilder = PrimitiveVectorBuilder; -pub type Float64VectorBuilder = PrimitiveVectorBuilder; +pub type Float32VectorBuilder = PrimitiveVectorBuilder; +pub type Float64VectorBuilder = PrimitiveVectorBuilder; -impl MutableVector for PrimitiveVectorBuilder { +/// Builder to build a primitive vector. +pub struct PrimitiveVectorBuilder { + mutable_array: PrimitiveBuilder, +} + +impl MutableVector for PrimitiveVectorBuilder { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -257,81 +310,62 @@ impl MutableVector for PrimitiveVectorBuilder { fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { let primitive = T::cast_value_ref(value)?; - self.mutable_array.push(primitive); + match primitive { + Some(v) => self.mutable_array.append_value(v.into_native()), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { let primitive = T::cast_vector(vector)?; // Slice the underlying array to avoid creating a new Arc. - let slice = primitive.slice(offset, length); - self.mutable_array.extend_trusted_len(slice.iter()); + let slice = primitive.get_slice(offset, length); + for v in slice.iter_data() { + self.push(v); + } Ok(()) } } impl ScalarVectorBuilder for PrimitiveVectorBuilder where - T: Scalar> + PrimitiveElement, - for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector>, - for<'a> T: Scalar = T>, + T: LogicalPrimitiveType, + T::Wrapper: Scalar>, + for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, + for<'a> T::Wrapper: Scalar = T::Wrapper>, { type VectorType = PrimitiveVector; fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutablePrimitiveArray::with_capacity(capacity), + mutable_array: PrimitiveBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + self.mutable_array + .append_option(value.map(|v| v.into_native())); } fn finish(&mut self) -> Self::VectorType { PrimitiveVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } -impl PrimitiveVectorBuilder { - fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self { - Self { - mutable_array: MutablePrimitiveArray::with_capacity_from( - capacity, - data_type.as_arrow_type(), - ), - } - } -} - -pub(crate) fn replicate_primitive( +pub(crate) fn replicate_primitive( vector: &PrimitiveVector, offsets: &[usize], -) -> VectorRef { - Arc::new(replicate_primitive_with_type( - vector, - offsets, - T::build_data_type(), - )) -} - -pub(crate) fn replicate_primitive_with_type( - vector: &PrimitiveVector, - offsets: &[usize], - data_type: ConcreteDataType, ) -> PrimitiveVector { assert_eq!(offsets.len(), vector.len()); if offsets.is_empty() { - return vector.slice(0, 0); + return vector.get_slice(0, 0); } - let mut builder = PrimitiveVectorBuilder::::with_type_capacity( - data_type, - *offsets.last().unwrap() as usize, - ); + let 
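(A usage sketch for the new builder aliases, mirroring the tests later in this file: `push` appends an optional value, `finish` yields the immutable vector.)

    use datatypes::scalars::ScalarVectorBuilder;
    use datatypes::vectors::{Int32VectorBuilder, Vector};

    let mut builder = Int32VectorBuilder::with_capacity(4);
    builder.push(Some(1));
    builder.push(None); // appends a null slot
    let vector = builder.finish();
    assert_eq!(2, vector.len());
    assert_eq!(1, vector.null_count());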
mut builder = PrimitiveVectorBuilder::::with_capacity(*offsets.last().unwrap() as usize); let mut previous_offset = 0; @@ -339,14 +373,15 @@ pub(crate) fn replicate_primitive_with_type( let repeat_times = *offset - previous_offset; match value { Some(data) => { - builder.mutable_array.extend_trusted_len( - std::iter::repeat(*data) - .take(repeat_times) - .map(Option::Some), - ); + unsafe { + // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen. + builder + .mutable_array + .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times)); + } } None => { - builder.mutable_array.extend_constant(repeat_times, None); + builder.mutable_array.append_nulls(repeat_times); } } previous_offset = *offset; @@ -356,6 +391,7 @@ pub(crate) fn replicate_primitive_with_type( #[cfg(test)] mod tests { + use arrow::array::Int32Array; use arrow::datatypes::DataType as ArrowDataType; use serde_json; @@ -364,11 +400,11 @@ mod tests { use crate::serialize::Serializable; use crate::types::Int64Type; - fn check_vec(v: PrimitiveVector) { + fn check_vec(v: Int32Vector) { assert_eq!(4, v.len()); assert_eq!("Int32Vector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); for i in 0..4 { @@ -387,26 +423,26 @@ mod tests { #[test] fn test_from_values() { - let v = PrimitiveVector::::from_values(vec![1, 2, 3, 4]); + let v = Int32Vector::from_values(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_vec() { - let v = PrimitiveVector::::from_vec(vec![1, 2, 3, 4]); + let v = Int32Vector::from_vec(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_slice() { - let v = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); + let v = Int32Vector::from_slice(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_serialize_primitive_vector_with_null_to_json() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -421,15 +457,15 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]); - let v = PrimitiveVector::from(arrow_array); + let arrow_array = Int32Array::from(vec![1, 2, 3, 4]); + let v = Int32Vector::from(arrow_array); check_vec(v); } #[test] fn test_primitive_vector_build_get() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -448,29 +484,28 @@ mod tests { #[test] fn test_primitive_vector_validity() { let input = [Some(1i32), Some(2i32), None, None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } let vector = builder.finish(); assert_eq!(2, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(2, slots.null_count()); - assert!(!slots.get_bit(2)); - assert!(!slots.get_bit(3)); + assert_eq!(2, validity.null_count()); + assert!(!validity.is_set(2)); + assert!(!validity.is_set(3)); - let vector = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); + let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, 
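(The `unsafe` block above leans on `TrustedLen`: `std::iter::repeat(..).take(n)` reports an exact size, which `append_trusted_len_iter` uses to reserve and write without per-item bounds checks. The same pattern on a raw arrow builder:)

    use arrow::array::{Array, Int32Builder};

    let mut builder = Int32Builder::with_capacity(8);
    unsafe {
        // Sound: Repeat + Take implement TrustedLen with an exact size hint.
        builder.append_trusted_len_iter(std::iter::repeat(7).take(3));
    }
    builder.append_nulls(2);
    let array = builder.finish();
    assert_eq!(5, array.len());
    assert_eq!(2, array.null_count());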
vector.validity()); + assert!(vector.validity().is_all_valid()); } #[test] fn test_memory_size() { - let v = PrimitiveVector::::from_slice((0..5).collect::>()); - assert_eq!(20, v.memory_size()); - let v = PrimitiveVector::::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); - assert_eq!(40, v.memory_size()); + let v = Int32Vector::from_slice((0..5).collect::>()); + assert_eq!(64, v.memory_size()); + let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); + assert_eq!(128, v.memory_size()); } #[test] @@ -489,4 +524,29 @@ mod tests { let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9])); assert_eq!(expect, vector); } + + #[test] + fn test_from_wrapper_slice() { + macro_rules! test_from_wrapper_slice { + ($vec: ident, $ty: ident) => { + let from_wrapper_slice = $vec::from_wrapper_slice(&[ + $ty::from_native($ty::MAX), + $ty::from_native($ty::MIN), + ]); + let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]); + assert_eq!(from_wrapper_slice, from_slice); + }; + } + + test_from_wrapper_slice!(UInt8Vector, u8); + test_from_wrapper_slice!(Int8Vector, i8); + test_from_wrapper_slice!(UInt16Vector, u16); + test_from_wrapper_slice!(Int16Vector, i16); + test_from_wrapper_slice!(UInt32Vector, u32); + test_from_wrapper_slice!(Int32Vector, i32); + test_from_wrapper_slice!(UInt64Vector, u64); + test_from_wrapper_slice!(Int64Vector, i64); + test_from_wrapper_slice!(Float32Vector, f32); + test_from_wrapper_slice!(Float64Vector, f64); + } } diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs index 638b04dd3e..252116b3b2 100644 --- a/src/datatypes/src/vectors/string.rs +++ b/src/datatypes/src/vectors/string.rs @@ -15,22 +15,19 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter}; -use arrow::bitmap::utils::ZipValidity; -use serde_json::Value as JsonValue; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; use crate::arrow_array::{MutableStringArray, StringArray}; use crate::data_type::ConcreteDataType; -use crate::error::{Result, SerializeSnafu}; +use crate::error::{self, Result}; use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; -use crate::types::StringType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; -/// String array wrapper -#[derive(Debug, Clone, PartialEq)] +/// Vector of strings. 
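(Why the memory figures changed to 64 and 128 above: `memory_size` now reports `get_buffer_memory_size`, i.e. allocated buffer capacity, not `len * size_of::<T>()`; arrow rounds buffer allocations up, so the value is a capacity bound rather than a payload size. A sanity check under that assumption:)

    use datatypes::vectors::{Int32Vector, Vector};

    let v = Int32Vector::from_slice(&[0, 1, 2, 3, 4]);
    // At least the 20 payload bytes, typically rounded up to 64.
    assert!(v.memory_size() >= 5 * std::mem::size_of::<i32>());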
+#[derive(Debug, PartialEq)] pub struct StringVector { array: StringArray, } @@ -39,6 +36,16 @@ impl StringVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: StringArray::from(data), + } + } } impl From for StringVector { @@ -50,19 +57,7 @@ impl From for StringVector { impl From>> for StringVector { fn from(data: Vec>) -> Self { Self { - array: StringArray::from(data), - } - } -} - -impl From> for StringVector { - fn from(data: Vec) -> Self { - Self { - array: StringArray::from( - data.into_iter() - .map(Option::Some) - .collect::>>(), - ), + array: StringArray::from_iter(data), } } } @@ -70,7 +65,31 @@ impl From> for StringVector { impl From>> for StringVector { fn from(data: Vec>) -> Self { Self { - array: StringArray::from(data), + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option]> for StringVector { + fn from(data: &[Option]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option<&str>]> for StringVector { + fn from(data: &[Option<&str>]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From> for StringVector { + fn from(data: Vec) -> Self { + Self { + array: StringArray::from_iter(data.into_iter().map(Some)), } } } @@ -78,18 +97,14 @@ impl From>> for StringVector { impl From> for StringVector { fn from(data: Vec<&str>) -> Self { Self { - array: StringArray::from( - data.into_iter() - .map(Option::Some) - .collect::>>(), - ), + array: StringArray::from_iter(data.into_iter().map(Some)), } } } impl Vector for StringVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::String(StringType::default()) + ConcreteDataType::string_datatype() } fn vector_type_name(&self) -> String { @@ -105,11 +120,13 @@ impl Vector for StringVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(StringArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(StringArray::from(data)) } fn validity(&self) -> Validity { @@ -117,7 +134,11 @@ impl Vector for StringVector { } fn memory_size(&self) -> usize { - self.len() * std::mem::size_of::() + self.array.values().len() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -125,7 +146,8 @@ impl Vector for StringVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -140,7 +162,7 @@ impl Vector for StringVector { impl ScalarVector for StringVector { type OwnedItem = String; type RefItem<'a> = &'a str; - type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>; + type Iter<'a> = ArrayIter<&'a StringArray>; type Builder = StringVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -157,7 +179,7 @@ impl ScalarVector for StringVector { } pub struct StringVectorBuilder { - buffer: MutableStringArray, + mutable_array: MutableStringArray, } impl MutableVector for StringVectorBuilder { @@ -166,7 +188,7 @@ impl MutableVector for StringVectorBuilder { } fn len(&self) -> usize { - self.buffer.len() + self.mutable_array.len() } fn as_any(&self) -> &dyn 
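(The widened set of `From` impls above all funnel through `StringArray::from_iter`, so owned and borrowed, optional and plain strings construct the same vector; a quick sketch:)

    use datatypes::vectors::StringVector;

    let a = StringVector::from(vec!["a", "b"]);
    let b = StringVector::from(vec![Some("a"), Some("b")]);
    let c = StringVector::from(&[Some("a"), Some("b")][..]);
    assert_eq!(a, b);
    assert_eq!(b, c);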
Any { @@ -182,12 +204,15 @@ impl MutableVector for StringVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_string()?); + match value.as_string()? { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length) } } @@ -196,30 +221,30 @@ impl ScalarVectorBuilder for StringVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - buffer: MutableStringArray::with_capacity(capacity), + mutable_array: MutableStringArray::with_capacity(capacity, 0), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value) + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: std::mem::take(&mut self.buffer).into(), + StringVector { + array: self.mutable_array.finish(), } } } impl Serializable for StringVector { - fn serialize_to_json(&self) -> crate::error::Result> { + fn serialize_to_json(&self) -> Result> { self.iter_data() - .map(|v| match v { - None => Ok(serde_json::Value::Null), - Some(s) => serde_json::to_value(s), - }) + .map(serde_json::to_value) .collect::>() - .context(SerializeSnafu) + .context(error::SerializeSnafu) } } @@ -227,60 +252,9 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector); #[cfg(test)] mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use serde_json; + use arrow::datatypes::DataType; use super::*; - use crate::data_type::DataType; - - #[test] - fn test_string_vector_misc() { - let strs = vec!["hello", "greptime", "rust"]; - let v = StringVector::from(strs.clone()); - assert_eq!(3, v.len()); - assert_eq!("StringVector", v.vector_type_name()); - assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); - assert!(!v.only_null()); - assert_eq!(41, v.memory_size()); - - for (i, s) in strs.iter().enumerate() { - assert_eq!(Value::from(*s), v.get(i)); - assert_eq!(ValueRef::from(*s), v.get_ref(i)); - assert_eq!(Value::from(*s), v.try_get(i).unwrap()); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(3, arrow_arr.len()); - assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_string_vector() { - let mut builder = StringVectorBuilder::with_capacity(3); - builder.push(Some("hello")); - builder.push(None); - builder.push(Some("world")); - let string_vector = builder.finish(); - let serialized = - serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["hello",null,"world"]"#, serialized); - } - - #[test] - fn test_from_arrow_array() { - let mut builder = MutableStringArray::new(); - builder.push(Some("A")); - builder.push(Some("B")); - builder.push::<&str>(None); - builder.push(Some("D")); - let string_array: StringArray = builder.into(); - let vector = StringVector::from(string_array); - assert_eq!( - r#"["A","B",null,"D"]"#, - serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), - ); - } #[test] fn test_string_vector_build_get() { @@ -310,7 +284,7 @@ mod tests { #[test] fn test_string_vector_builder() { - let mut builder = StringType::default().create_mutable_vector(3); + let mut builder = 
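(`push_value_ref` now routes through `ValueRef::as_string`, so a type mismatch surfaces as an error instead of a panic; this sketch mirrors `test_string_vector_builder` below.)

    use datatypes::scalars::ScalarVectorBuilder;
    use datatypes::value::ValueRef;
    use datatypes::vectors::{MutableVector, StringVectorBuilder};

    let mut builder = StringVectorBuilder::with_capacity(2);
    builder.push_value_ref(ValueRef::String("hello")).unwrap();
    // A non-string ValueRef is rejected with a cast error.
    assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
    assert_eq!(1, builder.len());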
StringVectorBuilder::with_capacity(3); builder.push_value_ref(ValueRef::String("hello")).unwrap(); assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); @@ -324,4 +298,73 @@ mod tests { let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"])); assert_eq!(expect, vector); } + + #[test] + fn test_string_vector_misc() { + let strs = vec!["hello", "greptime", "rust"]; + let v = StringVector::from(strs.clone()); + assert_eq!(3, v.len()); + assert_eq!("StringVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + assert_eq!(128, v.memory_size()); + + for (i, s) in strs.iter().enumerate() { + assert_eq!(Value::from(*s), v.get(i)); + assert_eq!(ValueRef::from(*s), v.get_ref(i)); + assert_eq!(Value::from(*s), v.try_get(i).unwrap()); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(3, arrow_arr.len()); + assert_eq!(&DataType::Utf8, arrow_arr.data_type()); + } + + #[test] + fn test_serialize_string_vector() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let string_vector = builder.finish(); + let serialized = + serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["hello",null,"world"]"#, serialized); + } + + #[test] + fn test_from_arrow_array() { + let mut builder = MutableStringArray::new(); + builder.append_option(Some("A")); + builder.append_option(Some("B")); + builder.append_null(); + builder.append_option(Some("D")); + let string_array: StringArray = builder.finish(); + let vector = StringVector::from(string_array); + assert_eq!( + r#"["A","B",null,"D"]"#, + serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), + ); + } + + #[test] + fn test_from_non_option_string() { + let nul = String::from_utf8(vec![0]).unwrap(); + let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized); + + let corpus = vec![ + "🀀🀀🀀".to_string(), + "🀁🀁🀁".to_string(), + "🀂🀂🀂".to_string(), + "🀃🀃🀃".to_string(), + "🀆🀆".to_string(), + ]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized); + } } diff --git a/src/datatypes/src/vectors/timestamp.rs b/src/datatypes/src/vectors/timestamp.rs index 62b8332c89..5d9f7f2ed1 100644 --- a/src/datatypes/src/vectors/timestamp.rs +++ b/src/datatypes/src/vectors/timestamp.rs @@ -12,308 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::timestamp::{TimeUnit, Timestamp}; -use snafu::OptionExt; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error; -use crate::error::Result; -use crate::prelude::{ - MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, +use crate::types::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, }; -use crate::serialize::Serializable; -use crate::types::TimestampType; -use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -/// `TimestampVector` stores timestamp in millisecond since UNIX Epoch. -#[derive(Debug, Clone, PartialEq)] -pub struct TimestampVector { - array: PrimitiveVector, -} +pub type TimestampSecondVector = PrimitiveVector; +pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder; -impl TimestampVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } +pub type TimestampMillisecondVector = PrimitiveVector; +pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder; - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) - } +pub type TimestampMicrosecondVector = PrimitiveVector; +pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder; - pub fn from_values>(iter: I) -> Self { - Self { - array: PrimitiveVector { - array: PrimitiveArray::from_values(iter), - }, - } - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for TimestampVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::timestamp_millis_datatype() - } - - fn vector_type_name(&self) -> String { - "TimestampVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - TimestampType::new(TimeUnit::Millisecond).as_arrow_type(), - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let values = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), - values, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector { - array: self.array.array.slice(offset, length), - }, - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Null => Value::Null, - Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)), - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)), - Value::Null => ValueRef::Null, - _ => unreachable!(), - } - } -} - 
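(With the hand-rolled `TimestampVector` gone, each time unit is now a plain `PrimitiveVector` alias, so the generic builders, `VectorOp`, and serialization all apply for free. A usage sketch, paths assumed:)

    use datatypes::data_type::ConcreteDataType;
    use datatypes::vectors::{TimestampMillisecondVector, Vector};

    // Values are i64 milliseconds, the type's native representation.
    let v = TimestampMillisecondVector::from_values([1_i64, 2, 3]);
    assert_eq!(3, v.len());
    assert_eq!(
        ConcreteDataType::timestamp_millisecond_datatype(),
        v.data_type()
    );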
-impl Serializable for TimestampVector { - fn serialize_to_json(&self) -> Result> { - Ok(self - .array - .iter_data() - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -impl ScalarVector for TimestampVector { - type OwnedItem = Timestamp; - type RefItem<'a> = Timestamp; - type Iter<'a> = TimestampDataIter<'a>; - type Builder = TimestampVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(Timestamp::from_millis) - } - - fn iter_data(&self) -> Self::Iter<'_> { - TimestampDataIter { - iter: self.array.iter_data(), - } - } -} - -pub struct TimestampDataIter<'a> { - iter: PrimitiveIter<'a, i64>, -} - -impl<'a> Iterator for TimestampDataIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(Timestamp::from_millis)) - } -} - -pub struct TimestampVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl MutableVector for TimestampVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::timestamp_millis_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - // TODO(hl): vector and vector builder should also support customized time unit. - self.buffer.push( - value - .as_timestamp()? - .map(|t| t.convert_to(TimeUnit::Millisecond)), - ); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -impl ScalarVectorBuilder for TimestampVectorBuilder { - type VectorType = TimestampVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit - /// `Second`/`MilliSecond`/`Microsecond`. 
- fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer - .push(value.map(|v| v.convert_to(TimeUnit::Millisecond))); - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(TimestampVector { array }) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - pub fn test_build_timestamp_vector() { - let mut builder = TimestampVectorBuilder::with_capacity(3); - builder.push(Some(Timestamp::new(1, TimeUnit::Second))); - builder.push(None); - builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond))); - - let vector = builder.finish(); - assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - vector.data_type() - ); - assert_eq!(3, vector.len()); - assert_eq!( - Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)), - vector.get(0) - ); - - assert_eq!(Value::Null, vector.get(1)); - assert_eq!( - Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)), - vector.get(2) - ); - - assert_eq!( - vec![ - Some(Timestamp::new(1000, TimeUnit::Millisecond)), - None, - Some(Timestamp::new(2, TimeUnit::Millisecond)), - ], - vector.iter_data().collect::>() - ); - } - - #[test] - fn test_timestamp_from_arrow() { - let vector = - TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]); - let arrow = vector.as_arrow().slice(0, vector.len()); - let vector2 = TimestampVector::try_from_arrow_array(&arrow).unwrap(); - assert_eq!(vector, vector2); - } -} +pub type TimestampNanosecondVector = PrimitiveVector; +pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder; diff --git a/src/datatypes2/src/vectors/validity.rs b/src/datatypes/src/vectors/validity.rs similarity index 100% rename from src/datatypes2/src/vectors/validity.rs rename to src/datatypes/src/vectors/validity.rs diff --git a/src/datatypes2/Cargo.toml b/src/datatypes2/Cargo.toml deleted file mode 100644 index ea60219544..0000000000 --- a/src/datatypes2/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "datatypes2" -version = "0.1.0" -edition = "2021" -license = "Apache-2.0" - -[features] -default = [] -test = [] - -[dependencies] -arrow = "26.0" -common-base = { path = "../common/base" } -common-error = { path = "../common/error" } -common-time = { path = "../common/time" } -datafusion-common = "14.0" -enum_dispatch = "0.3" -num = "0.4" -num-traits = "0.2" -ordered-float = { version = "3.0", features = ["serde"] } -paste = "1.0" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -snafu = { version = "0.7", features = ["backtraces"] } diff --git a/src/datatypes2/src/arrow_array.rs b/src/datatypes2/src/arrow_array.rs deleted file mode 100644 index 7405c8a665..0000000000 --- a/src/datatypes2/src/arrow_array.rs +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use arrow::array::{ - Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, -}; -use arrow::datatypes::DataType; -use common_time::timestamp::TimeUnit; -use common_time::Timestamp; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{ConversionSnafu, Result}; -use crate::value::{ListValue, Value}; - -pub type BinaryArray = arrow::array::LargeBinaryArray; -pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; -pub type StringArray = arrow::array::StringArray; -pub type MutableStringArray = arrow::array::StringBuilder; - -macro_rules! cast_array { - ($arr: ident, $CastType: ty) => { - $arr.as_any() - .downcast_ref::<$CastType>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", $arr.data_type()), - })? - }; -} - -// TODO(yingwen): Remove this function. -pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { - if array.is_null(idx) { - return Ok(Value::Null); - } - - let result = match array.data_type() { - DataType::Null => Value::Null, - DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), - DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), - DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), - DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), - DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), - DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), - DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), - DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), - DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), - DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), - DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()), - DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), - DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), - DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), - DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), - DataType::Timestamp(t, _) => match t { - arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampSecondArray).value(idx), - TimeUnit::Second, - )), - arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx), - TimeUnit::Millisecond, - )), - arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx), - TimeUnit::Microsecond, - )), - arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx), - TimeUnit::Nanosecond, - )), - }, - DataType::List(_) => { - let array = cast_array!(array, ListArray).value(idx); - let item_type = ConcreteDataType::try_from(array.data_type())?; - let values = (0..array.len()) - .map(|i| arrow_array_get(&*array, i)) - .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), 
item_type)) - } - _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), - }; - - Ok(result) -} - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use arrow::array::{ - BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }; - use arrow::datatypes::Int32Type; - use common_time::timestamp::{TimeUnit, Timestamp}; - use paste::paste; - - use super::*; - use crate::data_type::ConcreteDataType; - use crate::types::TimestampType; - - macro_rules! test_arrow_array_get_for_timestamps { - ( $($unit: ident), *) => { - $( - paste! { - let mut builder = arrow::array::[]::builder(3); - builder.append_value(1); - builder.append_value(0); - builder.append_value(-1); - let ts_array = Arc::new(builder.finish()) as Arc; - let v = arrow_array_get(&ts_array, 1).unwrap(); - assert_eq!( - ConcreteDataType::Timestamp(TimestampType::$unit( - $crate::types::[]::default(), - )), - v.data_type() - ); - } - )* - }; - } - - #[test] - fn test_timestamp_array() { - test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; - } - - #[test] - fn test_arrow_array_access() { - let array1 = BooleanArray::from(vec![true, true, false, false]); - assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt32Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from(vec![1, 2, 3, 4]); - assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); - assert_eq!( - Value::Float32(2f32.into()), - arrow_array_get(&array1, 1).unwrap() - ); - let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); - assert_eq!( - Value::Float64(2f64.into()), - arrow_array_get(&array1, 1).unwrap() - ); - - let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); - assert_eq!( - Value::String("hello".into()), - arrow_array_get(&array2, 0).unwrap() - ); - assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - - let array3 = LargeBinaryArray::from(vec![ - Some("hello".as_bytes()), - None, - Some("world".as_bytes()), - ]); - assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - - let array = TimestampSecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); - let array = TimestampMillisecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) - ); - let array = 
TimestampMicrosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) - ); - let array = TimestampNanosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) - ); - - // test list array - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - let arrow_array = ListArray::from_iter_primitive::(data); - - let v0 = arrow_array_get(&arrow_array, 0).unwrap(); - match v0 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!( - **items, - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] - ); - } - _ => unreachable!(), - } - - assert_eq!(Value::Null, arrow_array_get(&arrow_array, 1).unwrap()); - let v2 = arrow_array_get(&arrow_array, 2).unwrap(); - match v2 { - Value::List(list) => { - assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); - let items = list.items().as_ref().unwrap(); - assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); - } - _ => unreachable!(), - } - } -} diff --git a/src/datatypes2/src/data_type.rs b/src/datatypes2/src/data_type.rs deleted file mode 100644 index 0d06d566b6..0000000000 --- a/src/datatypes2/src/data_type.rs +++ /dev/null @@ -1,486 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; -use common_time::timestamp::TimeUnit; -use paste::paste; -use serde::{Deserialize, Serialize}; - -use crate::error::{self, Error, Result}; -use crate::type_id::LogicalTypeId; -use crate::types::{ - BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; -use crate::value::Value; -use crate::vectors::MutableVector; - -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[enum_dispatch::enum_dispatch(DataType)] -pub enum ConcreteDataType { - Null(NullType), - Boolean(BooleanType), - - // Numeric types: - Int8(Int8Type), - Int16(Int16Type), - Int32(Int32Type), - Int64(Int64Type), - UInt8(UInt8Type), - UInt16(UInt16Type), - UInt32(UInt32Type), - UInt64(UInt64Type), - Float32(Float32Type), - Float64(Float64Type), - - // String types: - Binary(BinaryType), - String(StringType), - - // Date types: - Date(DateType), - DateTime(DateTimeType), - Timestamp(TimestampType), - - // Compound types: - List(ListType), -} - -// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method -// returning all these properties to the `DataType` trait -impl ConcreteDataType { - pub fn is_float(&self) -> bool { - matches!( - self, - ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) - ) - } - - pub fn is_boolean(&self) -> bool { - matches!(self, ConcreteDataType::Boolean(_)) - } - - pub fn is_stringifiable(&self) -> bool { - matches!( - self, - ConcreteDataType::String(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::Timestamp(_) - ) - } - - pub fn is_signed(&self) -> bool { - matches!( - self, - ConcreteDataType::Int8(_) - | ConcreteDataType::Int16(_) - | ConcreteDataType::Int32(_) - | ConcreteDataType::Int64(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::Timestamp(_) - ) - } - - pub fn is_unsigned(&self) -> bool { - matches!( - self, - ConcreteDataType::UInt8(_) - | ConcreteDataType::UInt16(_) - | ConcreteDataType::UInt32(_) - | ConcreteDataType::UInt64(_) - ) - } - - pub fn numerics() -> Vec { - vec![ - ConcreteDataType::int8_datatype(), - ConcreteDataType::int16_datatype(), - ConcreteDataType::int32_datatype(), - ConcreteDataType::int64_datatype(), - ConcreteDataType::uint8_datatype(), - ConcreteDataType::uint16_datatype(), - ConcreteDataType::uint32_datatype(), - ConcreteDataType::uint64_datatype(), - ConcreteDataType::float32_datatype(), - ConcreteDataType::float64_datatype(), - ] - } - - /// Convert arrow data type to [ConcreteDataType]. - /// - /// # Panics - /// Panic if given arrow data type is not supported. 
- pub fn from_arrow_type(dt: &ArrowDataType) -> Self { - ConcreteDataType::try_from(dt).expect("Unimplemented type") - } - - pub fn is_null(&self) -> bool { - matches!(self, ConcreteDataType::Null(NullType)) - } -} - -impl TryFrom<&ArrowDataType> for ConcreteDataType { - type Error = Error; - - fn try_from(dt: &ArrowDataType) -> Result<ConcreteDataType> { - let concrete_type = match dt { - ArrowDataType::Null => Self::null_datatype(), - ArrowDataType::Boolean => Self::boolean_datatype(), - ArrowDataType::UInt8 => Self::uint8_datatype(), - ArrowDataType::UInt16 => Self::uint16_datatype(), - ArrowDataType::UInt32 => Self::uint32_datatype(), - ArrowDataType::UInt64 => Self::uint64_datatype(), - ArrowDataType::Int8 => Self::int8_datatype(), - ArrowDataType::Int16 => Self::int16_datatype(), - ArrowDataType::Int32 => Self::int32_datatype(), - ArrowDataType::Int64 => Self::int64_datatype(), - ArrowDataType::Float32 => Self::float32_datatype(), - ArrowDataType::Float64 => Self::float64_datatype(), - ArrowDataType::Date32 => Self::date_datatype(), - ArrowDataType::Date64 => Self::datetime_datatype(), - ArrowDataType::Timestamp(u, _) => ConcreteDataType::from_arrow_time_unit(u), - ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), - ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(field.data_type()), - )), - _ => { - return error::UnsupportedArrowTypeSnafu { - arrow_type: dt.clone(), - } - .fail() - } - }; - - Ok(concrete_type) - } -} - -macro_rules! impl_new_concrete_type_functions { - ($($Type: ident), +) => { - paste! { - impl ConcreteDataType { - $( - pub fn [<$Type:lower _datatype>]() -> ConcreteDataType { - ConcreteDataType::$Type([<$Type Type>]::default()) - } - )+ - } - } - } -} - -impl_new_concrete_type_functions!( - Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, Date, DateTime, String -); - -impl ConcreteDataType { - pub fn timestamp_second_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) - } - - pub fn timestamp_millisecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Millisecond( - TimestampMillisecondType::default(), - )) - } - - pub fn timestamp_microsecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Microsecond( - TimestampMicrosecondType::default(), - )) - } - - pub fn timestamp_nanosecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) - } - - pub fn timestamp_datatype(unit: TimeUnit) -> Self { - match unit { - TimeUnit::Second => Self::timestamp_second_datatype(), - TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), - } - } - - /// Converts from an arrow time unit to the corresponding timestamp datatype. - pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { - match t { - ArrowTimeUnit::Second => Self::timestamp_second_datatype(), - ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), - } - } - - pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(item_type)) - } -} - -/// Data type abstraction.
-#[enum_dispatch::enum_dispatch] -pub trait DataType: std::fmt::Debug + Send + Sync { - /// Name of this data type. - fn name(&self) -> &str; - - /// Returns id of the Logical data type. - fn logical_type_id(&self) -> LogicalTypeId; - - /// Returns the default value of this type. - fn default_value(&self) -> Value; - - /// Convert this type as [arrow::datatypes::DataType]. - fn as_arrow_type(&self) -> ArrowDataType; - - /// Creates a mutable vector with given `capacity` of this type. - fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector>; - - /// Returns true if the data type is compatible with timestamp type so we can - /// use it as a timestamp. - fn is_timestamp_compatible(&self) -> bool; -} - -pub type DataTypeRef = Arc<dyn DataType>; - -#[cfg(test)] -mod tests { - use arrow::datatypes::Field; - - use super::*; - - #[test] - fn test_concrete_type_as_datatype_trait() { - let concrete_type = ConcreteDataType::boolean_datatype(); - - assert_eq!("Boolean", concrete_type.name()); - assert_eq!(Value::Boolean(false), concrete_type.default_value()); - assert_eq!(LogicalTypeId::Boolean, concrete_type.logical_type_id()); - assert_eq!(ArrowDataType::Boolean, concrete_type.as_arrow_type()); - } - - #[test] - fn test_from_arrow_type() { - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Null), - ConcreteDataType::Null(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Boolean), - ConcreteDataType::Boolean(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Binary), - ConcreteDataType::Binary(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::LargeBinary), - ConcreteDataType::Binary(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int8), - ConcreteDataType::Int8(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int16), - ConcreteDataType::Int16(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int32), - ConcreteDataType::Int32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Int64), - ConcreteDataType::Int64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt8), - ConcreteDataType::UInt8(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt16), - ConcreteDataType::UInt16(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt32), - ConcreteDataType::UInt32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::UInt64), - ConcreteDataType::UInt64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Float32), - ConcreteDataType::Float32(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Float64), - ConcreteDataType::Float64(_) - )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), - ConcreteDataType::String(_) - )); - assert_eq!( - ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( - "item", - ArrowDataType::Int32, - true, - )))), - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())) - ); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Date32), - ConcreteDataType::Date(_) - )); - } - - #[test] - fn test_from_arrow_timestamp() { - assert_eq!( - ConcreteDataType::timestamp_millisecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) - ); - assert_eq!( -
ConcreteDataType::timestamp_microsecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) - ); - assert_eq!( - ConcreteDataType::timestamp_nanosecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) - ); - assert_eq!( - ConcreteDataType::timestamp_second_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) - ); - } - - #[test] - fn test_is_timestamp_compatible() { - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() - ); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() - ); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() - ); - assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); - } - - #[test] - fn test_is_null() { - assert!(ConcreteDataType::null_datatype().is_null()); - assert!(!ConcreteDataType::int32_datatype().is_null()); - } - - #[test] - fn test_is_float() { - assert!(!ConcreteDataType::int32_datatype().is_float()); - assert!(ConcreteDataType::float32_datatype().is_float()); - assert!(ConcreteDataType::float64_datatype().is_float()); - } - - #[test] - fn test_is_boolean() { - assert!(!ConcreteDataType::int32_datatype().is_boolean()); - assert!(!ConcreteDataType::float32_datatype().is_boolean()); - assert!(ConcreteDataType::boolean_datatype().is_boolean()); - } - - #[test] - fn test_is_stringifiable() { - assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); - assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); - assert!(ConcreteDataType::string_datatype().is_stringifiable()); - assert!(ConcreteDataType::date_datatype().is_stringifiable()); - assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); - } - - #[test] - fn test_is_signed() { - assert!(ConcreteDataType::int8_datatype().is_signed()); - assert!(ConcreteDataType::int16_datatype().is_signed()); - assert!(ConcreteDataType::int32_datatype().is_signed()); - assert!(ConcreteDataType::int64_datatype().is_signed()); - assert!(ConcreteDataType::date_datatype().is_signed()); - 
assert!(ConcreteDataType::datetime_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); - - assert!(!ConcreteDataType::uint8_datatype().is_signed()); - assert!(!ConcreteDataType::uint16_datatype().is_signed()); - assert!(!ConcreteDataType::uint32_datatype().is_signed()); - assert!(!ConcreteDataType::uint64_datatype().is_signed()); - - assert!(!ConcreteDataType::float32_datatype().is_signed()); - assert!(!ConcreteDataType::float64_datatype().is_signed()); - } - - #[test] - fn test_is_unsigned() { - assert!(!ConcreteDataType::int8_datatype().is_unsigned()); - assert!(!ConcreteDataType::int16_datatype().is_unsigned()); - assert!(!ConcreteDataType::int32_datatype().is_unsigned()); - assert!(!ConcreteDataType::int64_datatype().is_unsigned()); - assert!(!ConcreteDataType::date_datatype().is_unsigned()); - assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); - - assert!(ConcreteDataType::uint8_datatype().is_unsigned()); - assert!(ConcreteDataType::uint16_datatype().is_unsigned()); - assert!(ConcreteDataType::uint32_datatype().is_unsigned()); - assert!(ConcreteDataType::uint64_datatype().is_unsigned()); - - assert!(!ConcreteDataType::float32_datatype().is_unsigned()); - assert!(!ConcreteDataType::float64_datatype().is_unsigned()); - } - - #[test] - fn test_numerics() { - let nums = ConcreteDataType::numerics(); - assert_eq!(10, nums.len()); - } -} diff --git a/src/datatypes2/src/error.rs b/src/datatypes2/src/error.rs deleted file mode 100644 index 50b49cf2b4..0000000000 --- a/src/datatypes2/src/error.rs +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
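The `ConcreteDataType` enum deleted above pairs a closed set of logical types with a shared trait, dispatched via `enum_dispatch`. A condensed, self-contained sketch of that pattern with hand-rolled dispatch (all names here are illustrative, not the crate's API):

trait DataTypeLike {
    fn name(&self) -> &str;
}

struct BooleanLike;
struct Int32Like;

impl DataTypeLike for BooleanLike {
    fn name(&self) -> &str { "Boolean" }
}

impl DataTypeLike for Int32Like {
    fn name(&self) -> &str { "Int32" }
}

// The closed enum: each variant wraps one concrete type object.
enum ConcreteLike {
    Boolean(BooleanLike),
    Int32(Int32Like),
}

impl ConcreteLike {
    // Hand-written dispatch; the deleted code derived this with
    // `#[enum_dispatch::enum_dispatch(DataType)]`.
    fn name(&self) -> &str {
        match self {
            ConcreteLike::Boolean(t) => t.name(),
            ConcreteLike::Int32(t) => t.name(),
        }
    }
}

fn main() {
    assert_eq!("Boolean", ConcreteLike::Boolean(BooleanLike).name());
    assert_eq!("Int32", ConcreteLike::Int32(Int32Like).name());
}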
- -use std::any::Any; - -use common_error::prelude::{ErrorCompat, ErrorExt, Snafu, StatusCode}; -use snafu::Backtrace; - -#[derive(Debug, Snafu)] -#[snafu(visibility(pub))] -pub enum Error { - #[snafu(display("Failed to serialize data, source: {}", source))] - Serialize { - source: serde_json::Error, - backtrace: Backtrace, - }, - - #[snafu(display("Failed to deserialize data, source: {}, json: {}", source, json))] - Deserialize { - source: serde_json::Error, - backtrace: Backtrace, - json: String, - }, - - #[snafu(display("Failed to convert datafusion type: {}", from))] - Conversion { from: String, backtrace: Backtrace }, - - #[snafu(display("Bad array access, Index out of bounds: {}, size: {}", index, size))] - BadArrayAccess { - index: usize, - size: usize, - backtrace: Backtrace, - }, - - #[snafu(display("Unknown vector, {}", msg))] - UnknownVector { msg: String, backtrace: Backtrace }, - - #[snafu(display("Unsupported arrow data type, type: {:?}", arrow_type))] - UnsupportedArrowType { - arrow_type: arrow::datatypes::DataType, - backtrace: Backtrace, - }, - - #[snafu(display("Timestamp column {} not found", name,))] - TimestampNotFound { name: String, backtrace: Backtrace }, - - #[snafu(display( - "Failed to parse version in schema meta, value: {}, source: {}", - value, - source - ))] - ParseSchemaVersion { - value: String, - source: std::num::ParseIntError, - backtrace: Backtrace, - }, - - #[snafu(display("Invalid timestamp index: {}", index))] - InvalidTimestampIndex { index: usize, backtrace: Backtrace }, - - #[snafu(display("Duplicate timestamp index, exists: {}, new: {}", exists, new))] - DuplicateTimestampIndex { - exists: usize, - new: usize, - backtrace: Backtrace, - }, - - #[snafu(display("{}", msg))] - CastType { msg: String, backtrace: Backtrace }, - - #[snafu(display("Arrow failed to compute, source: {}", source))] - ArrowCompute { - source: arrow::error::ArrowError, - backtrace: Backtrace, - }, - - #[snafu(display("Unsupported column default constraint expression: {}", expr))] - UnsupportedDefaultExpr { expr: String, backtrace: Backtrace }, - - #[snafu(display("Default value should not be null for non null column"))] - NullDefault { backtrace: Backtrace }, - - #[snafu(display("Incompatible default value type, reason: {}", reason))] - DefaultValueType { - reason: String, - backtrace: Backtrace, - }, - - #[snafu(display("Duplicated metadata for {}", key))] - DuplicateMeta { key: String, backtrace: Backtrace }, -} - -impl ErrorExt for Error { - fn status_code(&self) -> StatusCode { - // Inner encoding and decoding error should not be exposed to users. 
- StatusCode::Internal - } - - fn backtrace_opt(&self) -> Option<&Backtrace> { - ErrorCompat::backtrace(self) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -pub type Result<T> = std::result::Result<T, Error>; - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - - use snafu::ResultExt; - - use super::*; - - #[test] - pub fn test_error() { - let mut map = HashMap::new(); - map.insert(true, 1); - map.insert(false, 2); - - let result = serde_json::to_string(&map).context(SerializeSnafu); - assert!(result.is_err(), "serialize result is: {:?}", result); - let err = serde_json::to_string(&map) - .context(SerializeSnafu) - .err() - .unwrap(); - assert!(err.backtrace_opt().is_some()); - assert_eq!(StatusCode::Internal, err.status_code()); - } -} diff --git a/src/datatypes2/src/lib.rs b/src/datatypes2/src/lib.rs deleted file mode 100644 index 256d347eac..0000000000 --- a/src/datatypes2/src/lib.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#![feature(generic_associated_types)] -#![feature(assert_matches)] - -pub mod arrow_array; -pub mod data_type; -pub mod error; -pub mod macros; -pub mod prelude; -mod scalars; -pub mod schema; -pub mod serialize; -mod timestamp; -pub mod type_id; -pub mod types; -pub mod value; -pub mod vectors; - -pub use arrow; -pub use error::{Error, Result}; diff --git a/src/datatypes2/src/macros.rs b/src/datatypes2/src/macros.rs deleted file mode 100644 index 37c0a42e3f..0000000000 --- a/src/datatypes2/src/macros.rs +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Some helper macros for datatypes, copied from databend. - -/// Apply the macro rules to all primitive types. -#[macro_export] -macro_rules! for_all_primitive_types { - ($macro:tt $(, $x:tt)*) => { - $macro! { - [$($x),*], - { i8 }, - { i16 }, - { i32 }, - { i64 }, - { u8 }, - { u16 }, - { u32 }, - { u64 }, - { f32 }, - { f64 } - } - }; -} - -/// Match the logical type and apply `$body` to all primitive types and -/// `$nbody` to other types. -#[macro_export] -macro_rules! with_match_primitive_type_id { - ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ - macro_rules! 
__with_ty__ { - ( $_ $T:ident ) => { - $body - }; - } - - use $crate::type_id::LogicalTypeId; - use $crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, - }; - match $key_type { - LogicalTypeId::Int8 => __with_ty__! { Int8Type }, - LogicalTypeId::Int16 => __with_ty__! { Int16Type }, - LogicalTypeId::Int32 => __with_ty__! { Int32Type }, - LogicalTypeId::Int64 => __with_ty__! { Int64Type }, - LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, - LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, - LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, - LogicalTypeId::UInt64 => __with_ty__! { UInt64Type }, - LogicalTypeId::Float32 => __with_ty__! { Float32Type }, - LogicalTypeId::Float64 => __with_ty__! { Float64Type }, - - _ => $nbody, - } - }}; -} diff --git a/src/datatypes2/src/prelude.rs b/src/datatypes2/src/prelude.rs deleted file mode 100644 index f6bd298316..0000000000 --- a/src/datatypes2/src/prelude.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; -pub use crate::macros::*; -pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; -pub use crate::type_id::LogicalTypeId; -pub use crate::value::{Value, ValueRef}; -pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; diff --git a/src/datatypes2/src/scalars.rs b/src/datatypes2/src/scalars.rs deleted file mode 100644 index 327ebaa629..0000000000 --- a/src/datatypes2/src/scalars.rs +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use common_time::{Date, DateTime}; - -use crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, - UInt64Type, UInt8Type, -}; -use crate::value::{ListValue, ListValueRef, Value}; -use crate::vectors::{ - BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, - PrimitiveVector, StringVector, Vector, -}; - -fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize { - match iter.size_hint() { - (_lower, Some(upper)) => upper, - (0, None) => 1024, - (lower, None) => lower, - } -} - -/// Owned scalar value -/// primitive types, bool, Vec<u8> ...
-pub trait Scalar: 'static + Sized + Default + Any -where - for<'a> Self::VectorType: ScalarVector<RefItem<'a> = Self::RefType<'a>>, -{ - type VectorType: ScalarVector; - type RefType<'a>: ScalarRef<'a, ScalarType = Self> - where - Self: 'a; - /// Get a reference of the current value. - fn as_scalar_ref(&self) -> Self::RefType<'_>; - - /// Upcast GAT type's lifetime. - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short>; -} - -pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a { - /// The corresponding [`Scalar`] type. - type ScalarType: Scalar<RefType<'a> = Self>; - - /// Convert the reference into an owned value. - fn to_owned_scalar(&self) -> Self::ScalarType; -} - -/// A sub trait of Vector to add scalar operation support. -// This implementation refers to Databend's [ScalarColumn](https://github.com/datafuselabs/databend/blob/main/common/datavalues/src/scalars/type_.rs) -// and skyzh's [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust). -pub trait ScalarVector: Vector + Send + Sync + Sized + 'static -where - for<'a> Self::OwnedItem: Scalar<RefType<'a> = Self::RefItem<'a>>, -{ - type OwnedItem: Scalar; - /// The reference item of this vector. - type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem> - where - Self: 'a; - - /// Iterator type of this vector. - type Iter<'a>: Iterator<Item = Option<Self::RefItem<'a>>> - where - Self: 'a; - - /// Builder type to build this vector. - type Builder: ScalarVectorBuilder<VectorType = Self>; - - /// Returns the reference to an element at given position. - /// - /// Note: `get()` has bad performance, avoid calling this function inside a loop. - /// - /// # Panics - /// Panics if `idx >= self.len()`. - fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>>; - - /// Returns iterator of current vector. - fn iter_data(&self) -> Self::Iter<'_>; - - fn from_slice(data: &[Self::RefItem<'_>]) -> Self { - let mut builder = Self::Builder::with_capacity(data.len()); - for item in data { - builder.push(Some(*item)); - } - builder.finish() - } - - fn from_iterator<'a>(it: impl Iterator<Item = Self::RefItem<'a>>) -> Self { - let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); - for item in it { - builder.push(Some(item)); - } - builder.finish() - } - - fn from_owned_iterator(it: impl Iterator<Item = Option<Self::OwnedItem>>) -> Self { - let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); - for item in it { - match item { - Some(item) => builder.push(Some(item.as_scalar_ref())), - None => builder.push(None), - } - } - builder.finish() - } - - fn from_vec<I: Into<Self::OwnedItem>>(values: Vec<I>) -> Self { - let it = values.into_iter(); - let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); - for item in it { - builder.push(Some(item.into().as_scalar_ref())); - } - builder.finish() - } -} - -/// A trait over all vector builders. -pub trait ScalarVectorBuilder: MutableVector { - type VectorType: ScalarVector<Builder = Self>; - - /// Create a new builder with initial `capacity`. - fn with_capacity(capacity: usize) -> Self; - - /// Push a value into the builder. - fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>); - - /// Finish build and return a new vector. - fn finish(&mut self) -> Self::VectorType; -} - -macro_rules! 
impl_scalar_for_native { - ($Native: ident, $DataType: ident) => { - impl Scalar for $Native { - type VectorType = PrimitiveVector<$DataType>; - type RefType<'a> = $Native; - - #[inline] - fn as_scalar_ref(&self) -> $Native { - *self - } - - #[allow(clippy::needless_lifetimes)] - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { - long - } - } - - /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. - impl<'a> ScalarRef<'a> for $Native { - type ScalarType = $Native; - - #[inline] - fn to_owned_scalar(&self) -> $Native { - *self - } - } - }; -} - -impl_scalar_for_native!(u8, UInt8Type); -impl_scalar_for_native!(u16, UInt16Type); -impl_scalar_for_native!(u32, UInt32Type); -impl_scalar_for_native!(u64, UInt64Type); -impl_scalar_for_native!(i8, Int8Type); -impl_scalar_for_native!(i16, Int16Type); -impl_scalar_for_native!(i32, Int32Type); -impl_scalar_for_native!(i64, Int64Type); -impl_scalar_for_native!(f32, Float32Type); -impl_scalar_for_native!(f64, Float64Type); - -impl Scalar for bool { - type VectorType = BooleanVector; - type RefType<'a> = bool; - - #[inline] - fn as_scalar_ref(&self) -> bool { - *self - } - - #[allow(clippy::needless_lifetimes)] - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: bool) -> bool { - long - } -} - -impl<'a> ScalarRef<'a> for bool { - type ScalarType = bool; - - #[inline] - fn to_owned_scalar(&self) -> bool { - *self - } -} - -impl Scalar for String { - type VectorType = StringVector; - type RefType<'a> = &'a str; - - #[inline] - fn as_scalar_ref(&self) -> &str { - self - } - - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: &'long str) -> &'short str { - long - } -} - -impl<'a> ScalarRef<'a> for &'a str { - type ScalarType = String; - - #[inline] - fn to_owned_scalar(&self) -> String { - self.to_string() - } -} - -impl Scalar for Vec<u8> { - type VectorType = BinaryVector; - type RefType<'a> = &'a [u8]; - - #[inline] - fn as_scalar_ref(&self) -> &[u8] { - self - } - - #[inline] - fn upcast_gat<'short, 'long: 'short>(long: &'long [u8]) -> &'short [u8] { - long - } -} - -impl<'a> ScalarRef<'a> for &'a [u8] { - type ScalarType = Vec<u8>; - - #[inline] - fn to_owned_scalar(&self) -> Vec<u8> { - self.to_vec() - } -} - -impl Scalar for Date { - type VectorType = DateVector; - type RefType<'a> = Date; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for Date { - type ScalarType = Date; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} - -impl Scalar for DateTime { - type VectorType = DateTimeVector; - type RefType<'a> = DateTime; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for DateTime { - type ScalarType = DateTime; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} - -// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`.
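The `Scalar`/`ScalarRef` pairing above reduces to an owned/borrowed round-trip tied together by a generic associated type. A self-contained sketch with `String`/`&str`, shadowing the deleted trait names for illustration only:

trait Scalar: Sized {
    // GAT: the borrowed form of this owned value, generic over its lifetime.
    type RefType<'a>: ScalarRef<'a, ScalarType = Self>
    where
        Self: 'a;

    fn as_scalar_ref(&self) -> Self::RefType<'_>;
}

trait ScalarRef<'a>: Copy {
    type ScalarType: Scalar;

    fn to_owned_scalar(&self) -> Self::ScalarType;
}

impl Scalar for String {
    type RefType<'a> = &'a str;

    fn as_scalar_ref(&self) -> &str {
        self
    }
}

impl<'a> ScalarRef<'a> for &'a str {
    type ScalarType = String;

    fn to_owned_scalar(&self) -> String {
        self.to_string()
    }
}

fn main() {
    // Owned -> borrowed -> owned round-trip, as the deleted tests exercise.
    let owned = String::from("greptime");
    let borrowed = owned.as_scalar_ref();
    assert_eq!(owned, borrowed.to_owned_scalar());
}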
- -impl Scalar for ListValue { - type VectorType = ListVector; - type RefType<'a> = ListValueRef<'a>; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - ListValueRef::Ref { val: self } - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for ListValueRef<'a> { - type ScalarType = ListValue; - - fn to_owned_scalar(&self) -> Self::ScalarType { - match self { - ListValueRef::Indexed { vector, idx } => match vector.get(*idx) { - // Normally should not get `Value::Null` if the `ListValueRef` comes - // from the iterator of the ListVector, but we avoid panic and just - // return a default list value in that case since `ListValueRef` may - // be constructed manually. - Value::Null => ListValue::default(), - Value::List(v) => v, - _ => unreachable!(), - }, - ListValueRef::Ref { val } => (*val).clone(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - use crate::timestamp::TimestampSecond; - use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; - - fn build_vector_from_slice<T: ScalarVector>(items: &[Option<T::RefItem<'_>>]) -> T { - let mut builder = T::Builder::with_capacity(items.len()); - for item in items { - builder.push(*item); - } - builder.finish() - } - - fn assert_vector_eq<'a, T: ScalarVector>(expect: &[Option<T::RefItem<'a>>], vector: &'a T) - where - T::RefItem<'a>: PartialEq + std::fmt::Debug, - { - for (a, b) in expect.iter().zip(vector.iter_data()) { - assert_eq!(*a, b); - } - } - - #[test] - fn test_build_i32_vector() { - let expect = vec![Some(1), Some(2), Some(3), None, Some(5)]; - let vector: Int32Vector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_build_binary_vector() { - let expect: Vec<Option<&[u8]>> = vec![ - Some(b"a"), - Some(b"b"), - Some(b"c"), - None, - Some(b"e"), - Some(b""), - ]; - let vector: BinaryVector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_build_date_vector() { - let expect: Vec<Option<Date>> = vec![ - Some(Date::new(0)), - Some(Date::new(-1)), - None, - Some(Date::new(1)), - ]; - let vector: DateVector = build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - } - - #[test] - fn test_date_scalar() { - let date = Date::new(1); - assert_eq!(date, date.as_scalar_ref()); - assert_eq!(date, date.to_owned_scalar()); - } - - #[test] - fn test_datetime_scalar() { - let dt = DateTime::new(123); - assert_eq!(dt, dt.as_scalar_ref()); - assert_eq!(dt, dt.to_owned_scalar()); - } - - #[test] - fn test_list_value_scalar() { - let list_value = ListValue::new( - Some(Box::new(vec![Value::Int32(123)])), - ConcreteDataType::int32_datatype(), - ); - let list_ref = ListValueRef::Ref { val: &list_value }; - assert_eq!(list_ref, list_value.as_scalar_ref()); - assert_eq!(list_value, list_ref.to_owned_scalar()); - - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1); - builder.push(None); - builder.push(Some(list_value.as_scalar_ref())); - let vector = builder.finish(); - - let ref_on_vec = ListValueRef::Indexed { - vector: &vector, - idx: 0, - }; - assert_eq!(ListValue::default(), ref_on_vec.to_owned_scalar()); - let ref_on_vec = ListValueRef::Indexed { - vector: &vector, - idx: 1, - }; - assert_eq!(list_value, ref_on_vec.to_owned_scalar()); - } - - #[test] - fn test_build_timestamp_vector() { - let expect: Vec<Option<TimestampSecond>> = vec![Some(10.into()), None, Some(42.into())]; - let vector: TimestampSecondVector = 
build_vector_from_slice(&expect); - assert_vector_eq(&expect, &vector); - let val = vector.get_data(0).unwrap(); - assert_eq!(val, val.as_scalar_ref()); - assert_eq!(TimestampSecond::from(10), val.to_owned_scalar()); - } -} diff --git a/src/datatypes2/src/schema.rs b/src/datatypes2/src/schema.rs deleted file mode 100644 index 328fe0de24..0000000000 --- a/src/datatypes2/src/schema.rs +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod column_schema; -mod constraint; -mod raw; - -use std::collections::HashMap; -use std::sync::Arc; - -use arrow::datatypes::{Field, Schema as ArrowSchema}; -use snafu::{ensure, ResultExt}; - -use crate::data_type::DataType; -use crate::error::{self, Error, Result}; -pub use crate::schema::column_schema::{ColumnSchema, Metadata}; -pub use crate::schema::constraint::ColumnDefaultConstraint; -pub use crate::schema::raw::RawSchema; - -/// Key used to store version number of the schema in metadata. -const VERSION_KEY: &str = "greptime:version"; - -/// A common schema, should be immutable. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Schema { - column_schemas: Vec<ColumnSchema>, - name_to_index: HashMap<String, usize>, - arrow_schema: Arc<ArrowSchema>, - /// Index of the timestamp key column. - /// - /// Timestamp key column is the column that holds the timestamp and forms part of - /// the primary key. None means there is no timestamp key column. - timestamp_index: Option<usize>, - /// Version of the schema. - /// - /// Initial value is zero. The version should be bumped after altering the schema. - version: u32, -} - -impl Schema { - /// Initial version of the schema. - pub const INITIAL_VERSION: u32 = 0; - - /// Create a schema from a vector of [ColumnSchema]. - /// - /// # Panics - /// Panics when ColumnSchema's `default_constraint` can't be serialized into json. - pub fn new(column_schemas: Vec<ColumnSchema>) -> Schema { - // Builder won't fail in this case - SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .unwrap() - } - - /// Try to create a schema from a vector of [ColumnSchema]. - pub fn try_new(column_schemas: Vec<ColumnSchema>) -> Result<Schema> { - SchemaBuilder::try_from(column_schemas)?.build() - } - - #[inline] - pub fn arrow_schema(&self) -> &Arc<ArrowSchema> { - &self.arrow_schema - } - - #[inline] - pub fn column_schemas(&self) -> &[ColumnSchema] { - &self.column_schemas - } - - pub fn column_schema_by_name(&self, name: &str) -> Option<&ColumnSchema> { - self.name_to_index - .get(name) - .map(|index| &self.column_schemas[*index]) - } - - /// Retrieve the column's name by index. - /// # Panics - /// This method **may** panic if the index is out of range of column schemas.
- #[inline] - pub fn column_name_by_index(&self, idx: usize) -> &str { - &self.column_schemas[idx].name - } - - #[inline] - pub fn column_index_by_name(&self, name: &str) -> Option<usize> { - self.name_to_index.get(name).copied() - } - - #[inline] - pub fn contains_column(&self, name: &str) -> bool { - self.name_to_index.contains_key(name) - } - - #[inline] - pub fn num_columns(&self) -> usize { - self.column_schemas.len() - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.column_schemas.is_empty() - } - - /// Returns index of the timestamp key column. - #[inline] - pub fn timestamp_index(&self) -> Option<usize> { - self.timestamp_index - } - - #[inline] - pub fn timestamp_column(&self) -> Option<&ColumnSchema> { - self.timestamp_index.map(|idx| &self.column_schemas[idx]) - } - - #[inline] - pub fn version(&self) -> u32 { - self.version - } - - #[inline] - pub fn metadata(&self) -> &HashMap<String, String> { - &self.arrow_schema.metadata - } -} - -#[derive(Default)] -pub struct SchemaBuilder { - column_schemas: Vec<ColumnSchema>, - name_to_index: HashMap<String, usize>, - fields: Vec<Field>, - timestamp_index: Option<usize>, - version: u32, - metadata: HashMap<String, String>, -} - -impl TryFrom<Vec<ColumnSchema>> for SchemaBuilder { - type Error = Error; - - fn try_from(column_schemas: Vec<ColumnSchema>) -> Result<Self> { - SchemaBuilder::try_from_columns(column_schemas) - } -} - -impl SchemaBuilder { - pub fn try_from_columns(column_schemas: Vec<ColumnSchema>) -> Result<Self> { - let FieldsAndIndices { - fields, - name_to_index, - timestamp_index, - } = collect_fields(&column_schemas)?; - - Ok(Self { - column_schemas, - name_to_index, - fields, - timestamp_index, - ..Default::default() - }) - } - - pub fn version(mut self, version: u32) -> Self { - self.version = version; - self - } - - /// Add a key-value pair to metadata. - /// - /// Old metadata with the same key will be overwritten.
- pub fn add_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self { - self.metadata.insert(key.into(), value.into()); - self - } - - pub fn build(mut self) -> Result<Schema> { - if let Some(timestamp_index) = self.timestamp_index { - validate_timestamp_index(&self.column_schemas, timestamp_index)?; - } - - self.metadata - .insert(VERSION_KEY.to_string(), self.version.to_string()); - - let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); - - Ok(Schema { - column_schemas: self.column_schemas, - name_to_index: self.name_to_index, - arrow_schema: Arc::new(arrow_schema), - timestamp_index: self.timestamp_index, - version: self.version, - }) - } -} - -struct FieldsAndIndices { - fields: Vec<Field>, - name_to_index: HashMap<String, usize>, - timestamp_index: Option<usize>, -} - -fn collect_fields(column_schemas: &[ColumnSchema]) -> Result<FieldsAndIndices> { - let mut fields = Vec::with_capacity(column_schemas.len()); - let mut name_to_index = HashMap::with_capacity(column_schemas.len()); - let mut timestamp_index = None; - for (index, column_schema) in column_schemas.iter().enumerate() { - if column_schema.is_time_index() { - ensure!( - timestamp_index.is_none(), - error::DuplicateTimestampIndexSnafu { - exists: timestamp_index.unwrap(), - new: index, - } - ); - timestamp_index = Some(index); - } - let field = Field::try_from(column_schema)?; - fields.push(field); - name_to_index.insert(column_schema.name.clone(), index); - } - - Ok(FieldsAndIndices { - fields, - name_to_index, - timestamp_index, - }) - } - -fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: usize) -> Result<()> { - ensure!( - timestamp_index < column_schemas.len(), - error::InvalidTimestampIndexSnafu { - index: timestamp_index, - } - ); - - let column_schema = &column_schemas[timestamp_index]; - ensure!( - column_schema.data_type.is_timestamp_compatible(), - error::InvalidTimestampIndexSnafu { - index: timestamp_index, - } - ); - ensure!( - column_schema.is_time_index(), - error::InvalidTimestampIndexSnafu { - index: timestamp_index, - } - ); - - Ok(()) -} - -pub type SchemaRef = Arc<Schema>; - -impl TryFrom<Arc<ArrowSchema>> for Schema { - type Error = Error; - - fn try_from(arrow_schema: Arc<ArrowSchema>) -> Result<Self> { - let mut column_schemas = Vec::with_capacity(arrow_schema.fields.len()); - let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); - for field in &arrow_schema.fields { - let column_schema = ColumnSchema::try_from(field)?; - name_to_index.insert(field.name().to_string(), column_schemas.len()); - column_schemas.push(column_schema); - } - - let mut timestamp_index = None; - for (index, column_schema) in column_schemas.iter().enumerate() { - if column_schema.is_time_index() { - validate_timestamp_index(&column_schemas, index)?; - ensure!( - timestamp_index.is_none(), - error::DuplicateTimestampIndexSnafu { - exists: timestamp_index.unwrap(), - new: index, - } - ); - timestamp_index = Some(index); - } - } - - let version = try_parse_version(&arrow_schema.metadata, VERSION_KEY)?; - - Ok(Self { - column_schemas, - name_to_index, - arrow_schema, - timestamp_index, - version, - }) - } -} - -impl TryFrom<ArrowSchema> for Schema { - type Error = Error; - - fn try_from(arrow_schema: ArrowSchema) -> Result<Self> { - let arrow_schema = Arc::new(arrow_schema); - - Schema::try_from(arrow_schema) - } -} - -fn try_parse_version(metadata: &HashMap<String, String>, key: &str) -> Result<u32> { - if let Some(value) = metadata.get(key) { - let version = value - .parse() - .context(error::ParseSchemaVersionSnafu { value })?; - - Ok(version) - } else { - Ok(Schema::INITIAL_VERSION) - } -}
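The builder above rides the schema version along in the arrow schema's metadata under "greptime:version". A small sketch of that convention against plain arrow types (the arrow version is an assumption; the parse at the end mirrors what `try_parse_version` above does):

use std::collections::HashMap;

use arrow::datatypes::{DataType, Field, Schema, TimeUnit};

fn main() {
    let mut metadata = HashMap::new();
    metadata.insert("greptime:version".to_string(), "123".to_string());

    let schema = Schema::new(vec![
        Field::new("col1", DataType::Int32, true),
        Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ),
    ])
    .with_metadata(metadata);

    // Recover the version, defaulting to 0 (INITIAL_VERSION) when absent.
    let version: u32 = schema
        .metadata()
        .get("greptime:version")
        .map(|v| v.parse().unwrap())
        .unwrap_or(0);
    assert_eq!(123, version);
}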
- -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - - #[test] - fn test_build_empty_schema() { - let schema = SchemaBuilder::default().build().unwrap(); - assert_eq!(0, schema.num_columns()); - assert!(schema.is_empty()); - } - - #[test] - fn test_schema_no_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), false), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), true), - ]; - let schema = Schema::new(column_schemas.clone()); - - assert_eq!(2, schema.num_columns()); - assert!(!schema.is_empty()); - assert!(schema.timestamp_index().is_none()); - assert!(schema.timestamp_column().is_none()); - assert_eq!(Schema::INITIAL_VERSION, schema.version()); - - for column_schema in &column_schemas { - let found = schema.column_schema_by_name(&column_schema.name).unwrap(); - assert_eq!(column_schema, found); - } - assert!(schema.column_schema_by_name("col3").is_none()); - - let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); - - assert_eq!(schema, new_schema); - assert_eq!(column_schemas, schema.column_schemas()); - } - - #[test] - fn test_metadata() { - let column_schemas = vec![ColumnSchema::new( - "col1", - ConcreteDataType::int32_datatype(), - false, - )]; - let schema = SchemaBuilder::try_from(column_schemas) - .unwrap() - .add_metadata("k1", "v1") - .build() - .unwrap(); - - assert_eq!("v1", schema.metadata().get("k1").unwrap()); - } - - #[test] - fn test_schema_with_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), - ]; - let schema = SchemaBuilder::try_from(column_schemas.clone()) - .unwrap() - .version(123) - .build() - .unwrap(); - - assert_eq!(1, schema.timestamp_index().unwrap()); - assert_eq!(&column_schemas[1], schema.timestamp_column().unwrap()); - assert_eq!(123, schema.version()); - - let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); - assert_eq!(1, schema.timestamp_index().unwrap()); - assert_eq!(schema, new_schema); - } - - #[test] - fn test_schema_wrong_timestamp() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true) - .with_time_index(true), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false), - ]; - assert!(SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .is_err()); - - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false) - .with_time_index(true), - ]; - - assert!(SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .is_err()); - } -} diff --git a/src/datatypes2/src/schema/constraint.rs b/src/datatypes2/src/schema/constraint.rs deleted file mode 100644 index 4dd3ecc14b..0000000000 --- a/src/datatypes2/src/schema/constraint.rs +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt::{Display, Formatter}; -use std::sync::Arc; - -use common_time::util; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, ResultExt}; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, Result}; -use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; - -const CURRENT_TIMESTAMP: &str = "current_timestamp()"; - -/// Column's default constraint. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum ColumnDefaultConstraint { - // A function invocation - // TODO(dennis): we save the function expression here, maybe use a struct in future. - Function(String), - // A value - Value(Value), -} - -impl TryFrom<&[u8]> for ColumnDefaultConstraint { - type Error = error::Error; - - fn try_from(bytes: &[u8]) -> Result<Self> { - let json = String::from_utf8_lossy(bytes); - serde_json::from_str(&json).context(error::DeserializeSnafu { json }) - } -} - -impl TryFrom<ColumnDefaultConstraint> for Vec<u8> { - type Error = error::Error; - - fn try_from(value: ColumnDefaultConstraint) -> std::result::Result<Self, Self::Error> { - let s = serde_json::to_string(&value).context(error::SerializeSnafu)?; - Ok(s.into_bytes()) - } -} - -impl Display for ColumnDefaultConstraint { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - ColumnDefaultConstraint::Function(expr) => write!(f, "{}", expr), - ColumnDefaultConstraint::Value(v) => write!(f, "{}", v), - } - } -} - -impl ColumnDefaultConstraint { - /// Returns a default null constraint. - pub fn null_value() -> ColumnDefaultConstraint { - ColumnDefaultConstraint::Value(Value::Null) - } - - /// Check whether the constraint is valid for columns with given `data_type` - /// and `is_nullable` attributes. - pub fn validate(&self, data_type: &ConcreteDataType, is_nullable: bool) -> Result<()> { - ensure!(is_nullable || !self.maybe_null(), error::NullDefaultSnafu); - - match self { - ColumnDefaultConstraint::Function(expr) => { - ensure!( - expr == CURRENT_TIMESTAMP, - error::UnsupportedDefaultExprSnafu { expr } - ); - ensure!( - data_type.is_timestamp_compatible(), - error::DefaultValueTypeSnafu { - reason: "return value of the function must have timestamp type", - } - ); - } - ColumnDefaultConstraint::Value(v) => { - if !v.is_null() { - // Whether the value could be nullable has been checked before, only need - // to check the type compatibility here. - ensure!( - data_type.logical_type_id() == v.logical_type_id(), - error::DefaultValueTypeSnafu { - reason: format!( - "column has type {:?} but default value has type {:?}", - data_type.logical_type_id(), - v.logical_type_id() - ), - } - ); - } - } - } - - Ok(()) - } - - /// Create a vector that contains `num_rows` default values for given `data_type`. - /// - /// If `is_nullable` is `false`, then this method would return an error if the created - /// default value is null. - /// - /// # Panics - /// Panics if `num_rows == 0`.
- pub fn create_default_vector( - &self, - data_type: &ConcreteDataType, - is_nullable: bool, - num_rows: usize, - ) -> Result<VectorRef> { - assert!(num_rows > 0); - - match self { - ColumnDefaultConstraint::Function(expr) => { - // Functions should also ensure their return value is not null when - // `is_nullable` is false. - match &expr[..] { - // TODO(dennis): we only support current_timestamp right now; - // it's better to use an expression framework in the future. - CURRENT_TIMESTAMP => create_current_timestamp_vector(data_type, num_rows), - _ => error::UnsupportedDefaultExprSnafu { expr }.fail(), - } - } - ColumnDefaultConstraint::Value(v) => { - ensure!(is_nullable || !v.is_null(), error::NullDefaultSnafu); - - // TODO(yingwen): - // 1. For null value, we could use NullVector once it supports custom logical type. - // 2. For non null value, we could use ConstantVector, but it would cause all codes - // attempt to downcast the vector fail if they don't check whether the vector is const - // first. - let mut mutable_vector = data_type.create_mutable_vector(1); - mutable_vector.push_value_ref(v.as_value_ref())?; - let base_vector = mutable_vector.to_vector(); - Ok(base_vector.replicate(&[num_rows])) - } - } - } - - /// Returns true if this constraint might create NULL. - fn maybe_null(&self) -> bool { - // Once we support more functions, we may return true if given function - // could return null. - matches!(self, ColumnDefaultConstraint::Value(Value::Null)) - } -} - -fn create_current_timestamp_vector( - data_type: &ConcreteDataType, - num_rows: usize, -) -> Result<VectorRef> { - // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector - // to other data type and avoid this match. - match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( - std::iter::repeat(util::current_time_millis()).take(num_rows), - ))), - ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( - std::iter::repeat(util::current_time_millis()).take(num_rows), - ))), - _ => error::DefaultValueTypeSnafu { - reason: format!( - "Not supported to assign current timestamp to {:?} type", - data_type - ), - } - .fail(), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::error::Error; - use crate::vectors::Int32Vector; - - #[test] - fn test_null_default_constraint() { - let constraint = ColumnDefaultConstraint::null_value(); - assert!(constraint.maybe_null()); - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - assert!(!constraint.maybe_null()); - } - - #[test] - fn test_validate_null_constraint() { - let constraint = ColumnDefaultConstraint::null_value(); - let data_type = ConcreteDataType::int32_datatype(); - constraint.validate(&data_type, false).unwrap_err(); - constraint.validate(&data_type, true).unwrap(); - } - - #[test] - fn test_validate_value_constraint() { - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - let data_type = ConcreteDataType::int32_datatype(); - constraint.validate(&data_type, false).unwrap(); - constraint.validate(&data_type, true).unwrap(); - - constraint - .validate(&ConcreteDataType::uint32_datatype(), true) - .unwrap_err(); - } - - #[test] - fn test_validate_function_constraint() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) - .unwrap(); - constraint - .validate(&ConcreteDataType::boolean_datatype(), false) - .unwrap_err(); - - let constraint = 
ColumnDefaultConstraint::Function("hello()".to_string()); - constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) - .unwrap_err(); - } - - #[test] - fn test_create_default_vector_by_null() { - let constraint = ColumnDefaultConstraint::null_value(); - let data_type = ConcreteDataType::int32_datatype(); - constraint - .create_default_vector(&data_type, false, 10) - .unwrap_err(); - - let constraint = ColumnDefaultConstraint::null_value(); - let v = constraint - .create_default_vector(&data_type, true, 3) - .unwrap(); - assert_eq!(3, v.len()); - for i in 0..v.len() { - assert_eq!(Value::Null, v.get(i)); - } - } - - #[test] - fn test_create_default_vector_by_value() { - let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); - let data_type = ConcreteDataType::int32_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - let expect: VectorRef = Arc::new(Int32Vector::from_values(vec![10; 4])); - assert_eq!(expect, v); - } - - #[test] - fn test_create_default_vector_by_func() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - // Timestamp type. - let data_type = ConcreteDataType::timestamp_millisecond_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - assert_eq!(4, v.len()); - assert!( - matches!(v.get(0), Value::Timestamp(_)), - "v {:?} is not timestamp", - v.get(0) - ); - - // Int64 type. - let data_type = ConcreteDataType::int64_datatype(); - let v = constraint - .create_default_vector(&data_type, false, 4) - .unwrap(); - assert_eq!(4, v.len()); - assert!( - matches!(v.get(0), Value::Int64(_)), - "v {:?} is not timestamp", - v.get(0) - ); - - let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millisecond_datatype(); - constraint - .create_default_vector(&data_type, false, 4) - .unwrap_err(); - } - - #[test] - fn test_create_by_func_and_invalid_type() { - let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); - let data_type = ConcreteDataType::boolean_datatype(); - let err = constraint - .create_default_vector(&data_type, false, 4) - .unwrap_err(); - assert!(matches!(err, Error::DefaultValueType { .. }), "{:?}", err); - } -} diff --git a/src/datatypes2/src/schema/raw.rs b/src/datatypes2/src/schema/raw.rs deleted file mode 100644 index 75f0853b4b..0000000000 --- a/src/datatypes2/src/schema/raw.rs +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use serde::{Deserialize, Serialize}; - -use crate::error::{Error, Result}; -use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; - -/// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). -/// -/// This struct only contains necessary data to recover the Schema. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct RawSchema { - pub column_schemas: Vec<ColumnSchema>, - pub timestamp_index: Option<usize>, - pub version: u32, -} - -impl TryFrom<RawSchema> for Schema { - type Error = Error; - - fn try_from(raw: RawSchema) -> Result<Self> { - SchemaBuilder::try_from(raw.column_schemas)? - .version(raw.version) - .build() - } -} - -impl From<&Schema> for RawSchema { - fn from(schema: &Schema) -> RawSchema { - RawSchema { - column_schemas: schema.column_schemas.clone(), - timestamp_index: schema.timestamp_index, - version: schema.version, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::data_type::ConcreteDataType; - - #[test] - fn test_raw_convert() { - let column_schemas = vec![ - ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), - ]; - let schema = SchemaBuilder::try_from(column_schemas) - .unwrap() - .version(123) - .build() - .unwrap(); - - let raw = RawSchema::from(&schema); - let schema_new = Schema::try_from(raw).unwrap(); - - assert_eq!(schema, schema_new); - } -} diff --git a/src/datatypes2/src/serialize.rs b/src/datatypes2/src/serialize.rs deleted file mode 100644 index 1cbf04cedd..0000000000 --- a/src/datatypes2/src/serialize.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::error::Result; - -pub trait Serializable: Send + Sync { - /// Serializes a column of values with the given type to JSON values. - fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>>; -} diff --git a/src/datatypes2/src/type_id.rs b/src/datatypes2/src/type_id.rs deleted file mode 100644 index bcb7ea52b1..0000000000 --- a/src/datatypes2/src/type_id.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/// Unique identifier for logical data type. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum LogicalTypeId { - Null, - - // Numeric types: - Boolean, - Int8, - Int16, - Int32, - Int64, - UInt8, - UInt16, - UInt32, - UInt64, - Float32, - Float64, - - // String types: - String, - Binary, - - // Date & Time types: - /// Date representing the elapsed time since UNIX epoch (1970-01-01) - /// in days (32 bits). - Date, - /// Datetime representing the elapsed time since UNIX epoch (1970-01-01) in - /// seconds/milliseconds/microseconds/nanoseconds, determined by precision.
- DateTime, - - TimestampSecond, - TimestampMillisecond, - TimestampMicrosecond, - TimestampNanosecond, - - List, -} - -impl LogicalTypeId { - /// Create ConcreteDataType based on this id. This method is for test only as it - /// would lose some info. - /// - /// # Panics - /// Panics if data type is not supported. - #[cfg(any(test, feature = "test"))] - pub fn data_type(&self) -> crate::data_type::ConcreteDataType { - use crate::data_type::ConcreteDataType; - - match self { - LogicalTypeId::Null => ConcreteDataType::null_datatype(), - LogicalTypeId::Boolean => ConcreteDataType::boolean_datatype(), - LogicalTypeId::Int8 => ConcreteDataType::int8_datatype(), - LogicalTypeId::Int16 => ConcreteDataType::int16_datatype(), - LogicalTypeId::Int32 => ConcreteDataType::int32_datatype(), - LogicalTypeId::Int64 => ConcreteDataType::int64_datatype(), - LogicalTypeId::UInt8 => ConcreteDataType::uint8_datatype(), - LogicalTypeId::UInt16 => ConcreteDataType::uint16_datatype(), - LogicalTypeId::UInt32 => ConcreteDataType::uint32_datatype(), - LogicalTypeId::UInt64 => ConcreteDataType::uint64_datatype(), - LogicalTypeId::Float32 => ConcreteDataType::float32_datatype(), - LogicalTypeId::Float64 => ConcreteDataType::float64_datatype(), - LogicalTypeId::String => ConcreteDataType::string_datatype(), - LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), - LogicalTypeId::Date => ConcreteDataType::date_datatype(), - LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), - LogicalTypeId::TimestampMillisecond => { - ConcreteDataType::timestamp_millisecond_datatype() - } - LogicalTypeId::TimestampMicrosecond => { - ConcreteDataType::timestamp_microsecond_datatype() - } - LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), - LogicalTypeId::List => { - ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) - } - } - } -} diff --git a/src/datatypes2/src/types.rs b/src/datatypes2/src/types.rs deleted file mode 100644 index 186704fdfd..0000000000 --- a/src/datatypes2/src/types.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
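As the doc comment notes, `LogicalTypeId::data_type()` is test-only and lossy (`List`, for instance, always comes back with a null item type); the property it does preserve is the id round-trip. A sketch, assuming the crate's `test` feature is enabled and using illustrative paths:

```rust
use datatypes::data_type::DataType; // trait providing logical_type_id()
use datatypes::type_id::LogicalTypeId;

fn assert_round_trip(id: LogicalTypeId) {
    // data_type() is gated by #[cfg(any(test, feature = "test"))] above.
    assert_eq!(id, id.data_type().logical_type_id());
}

fn logical_type_id_round_trip() {
    assert_round_trip(LogicalTypeId::Int32);
    assert_round_trip(LogicalTypeId::TimestampMillisecond);
    assert_round_trip(LogicalTypeId::List); // the item type is lost, the id is not
}
```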
- -mod binary_type; -mod boolean_type; -mod date_type; -mod datetime_type; -mod list_type; -mod null_type; -mod primitive_type; -mod string_type; - -mod timestamp_type; - -pub use binary_type::BinaryType; -pub use boolean_type::BooleanType; -pub use date_type::DateType; -pub use datetime_type::DateTimeType; -pub use list_type::ListType; -pub use null_type::NullType; -pub use primitive_type::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, - NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, -}; -pub use string_type::StringType; -pub use timestamp_type::*; diff --git a/src/datatypes2/src/types/binary_type.rs b/src/datatypes2/src/types/binary_type.rs deleted file mode 100644 index 0d06724fff..0000000000 --- a/src/datatypes2/src/types/binary_type.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use common_base::bytes::StringBytes; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::scalars::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{BinaryVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct BinaryType; - -impl BinaryType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for BinaryType { - fn name(&self) -> &str { - "Binary" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Binary - } - - fn default_value(&self) -> Value { - StringBytes::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::LargeBinary - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(BinaryVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/boolean_type.rs b/src/datatypes2/src/types/boolean_type.rs deleted file mode 100644 index 36d92169eb..0000000000 --- a/src/datatypes2/src/types/boolean_type.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
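`BinaryType` above is typical of these leaf types: it names itself, maps to Arrow's `LargeBinary`, and hands out a builder. A sketch of driving it through the type-erased builder interface (`push_value_ref`/`to_vector` are defined on `MutableVector` further down; paths illustrative):

```rust
use datatypes::data_type::DataType;
use datatypes::types::BinaryType;
use datatypes::value::ValueRef;
use datatypes::vectors::VectorRef;

fn build_binary_vector() -> VectorRef {
    let mut builder = BinaryType::default().create_mutable_vector(2);
    builder.push_value_ref(ValueRef::Binary(b"hello".as_slice())).unwrap();
    builder.push_value_ref(ValueRef::Null).unwrap(); // NULL slot
    builder.to_vector() // freezes the builder into an immutable VectorRef
}
```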
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::scalars::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{BooleanVectorBuilder, MutableVector}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct BooleanType; - -impl BooleanType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for BooleanType { - fn name(&self) -> &str { - "Boolean" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Boolean - } - - fn default_value(&self) -> Value { - bool::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Boolean - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(BooleanVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/list_type.rs b/src/datatypes2/src/types/list_type.rs deleted file mode 100644 index b9875ca362..0000000000 --- a/src/datatypes2/src/types/list_type.rs +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use arrow::datatypes::{DataType as ArrowDataType, Field}; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::type_id::LogicalTypeId; -use crate::value::{ListValue, Value}; -use crate::vectors::{ListVectorBuilder, MutableVector}; - -/// Used to represent the List datatype. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ListType { - /// The type of List's item. - // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. - item_type: Box, -} - -impl Default for ListType { - fn default() -> Self { - ListType::new(ConcreteDataType::null_datatype()) - } -} - -impl ListType { - /// Create a new `ListType` whose item's data type is `item_type`. 
- pub fn new(item_type: ConcreteDataType) -> Self { - ListType { - item_type: Box::new(item_type), - } - } -} - -impl DataType for ListType { - fn name(&self) -> &str { - "List" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::List - } - - fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.item_type.clone())) - } - - fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); - ArrowDataType::List(field) - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(ListVectorBuilder::with_type_capacity( - *self.item_type.clone(), - capacity, - )) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::value::ListValue; - - #[test] - fn test_list_type() { - let t = ListType::new(ConcreteDataType::boolean_datatype()); - assert_eq!("List", t.name()); - assert_eq!(LogicalTypeId::List, t.logical_type_id()); - assert_eq!( - Value::List(ListValue::new(None, ConcreteDataType::boolean_datatype())), - t.default_value() - ); - assert_eq!( - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Boolean, true))), - t.as_arrow_type() - ); - } -} diff --git a/src/datatypes2/src/types/null_type.rs b/src/datatypes2/src/types/null_type.rs deleted file mode 100644 index b9bb2dc752..0000000000 --- a/src/datatypes2/src/types/null_type.rs +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{MutableVector, NullVectorBuilder}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct NullType; - -impl NullType { - pub fn arc() -> DataTypeRef { - Arc::new(NullType) - } -} - -impl DataType for NullType { - fn name(&self) -> &str { - "Null" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Null - } - - fn default_value(&self) -> Value { - Value::Null - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Null - } - - fn create_mutable_vector(&self, _capacity: usize) -> Box { - Box::new(NullVectorBuilder::default()) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/types/primitive_type.rs b/src/datatypes2/src/types/primitive_type.rs deleted file mode 100644 index e389ca13bf..0000000000 --- a/src/datatypes2/src/types/primitive_type.rs +++ /dev/null @@ -1,358 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; - -use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; -use common_time::{Date, DateTime}; -use num::NumCast; -use serde::{Deserialize, Serialize}; -use snafu::OptionExt; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, Result}; -use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; -use crate::type_id::LogicalTypeId; -use crate::types::{DateTimeType, DateType}; -use crate::value::{Value, ValueRef}; -use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; - -/// Data types that can be used as arrow's native type. -pub trait NativeType: ArrowNativeType + NumCast { - /// Largest numeric type this primitive type can be cast to. - type LargestType: NativeType; -} - -macro_rules! impl_native_type { - ($Type: ident, $LargestType: ident) => { - impl NativeType for $Type { - type LargestType = $LargestType; - } - }; -} - -impl_native_type!(u8, u64); -impl_native_type!(u16, u64); -impl_native_type!(u32, u64); -impl_native_type!(u64, u64); -impl_native_type!(i8, i64); -impl_native_type!(i16, i64); -impl_native_type!(i32, i64); -impl_native_type!(i64, i64); -impl_native_type!(f32, f64); -impl_native_type!(f64, f64); - -/// Represents a wrapper type that wraps a native type using the `newtype pattern`; -/// for example, [Date](`common_time::Date`) is a wrapper type for the underlying native -/// type `i32`. -pub trait WrapperType: - Copy - + for<'a> Scalar<RefType<'a> = Self> - + PartialEq - + Into<Value> - + Into<ValueRef<'static>> - + Serialize - + Into<serde_json::Value> -{ - /// Logical primitive type that this wrapper type belongs to. - type LogicalType: LogicalPrimitiveType; - /// The underlying native type. - type Native: NativeType; - - /// Convert native type into this wrapper type. - fn from_native(value: Self::Native) -> Self; - - /// Convert this wrapper type into native type. - fn into_native(self) -> Self::Native; -} - -/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. -pub trait LogicalPrimitiveType: 'static + Sized { - /// Arrow primitive type of this logical type. - type ArrowPrimitive: ArrowPrimitiveType<Native = Self::Native>; - /// Native (physical) type of this logical type. - type Native: NativeType; - /// Wrapper type that the vector returns. - type Wrapper: WrapperType<LogicalType = Self, Native = Self::Native> - + for<'a> Scalar<VectorType = PrimitiveVector<Self>, RefType<'a> = Self::Wrapper> - + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; - - /// Construct the data type struct. - fn build_data_type() -> ConcreteDataType; - - /// Return the name of the type. - fn type_name() -> &'static str; - - /// Dynamic cast the vector to the concrete vector type. - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<Self>>; - - /// Cast value ref to the primitive type. - fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>>; -} - -/// A newtype for [WrapperType] that complements it with the `Ord` feature. Wrapping non-ordered -/// primitive types like `f32` and `f64` in `OrdPrimitive` makes them usable in places that -/// require `Ord`. For example, in `Median` or `Percentile` UDAFs.
-#[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrdPrimitive<T: WrapperType>(pub T); - -impl<T: WrapperType> OrdPrimitive<T> { - pub fn as_primitive(&self) -> T { - self.0 - } -} - -impl<T: WrapperType> Eq for OrdPrimitive<T> {} - -impl<T: WrapperType> PartialOrd for OrdPrimitive<T> { - fn partial_cmp(&self, other: &Self) -> Option<Ordering> { - Some(self.cmp(other)) - } -} - -impl<T: WrapperType> Ord for OrdPrimitive<T> { - fn cmp(&self, other: &Self) -> Ordering { - Into::<Value>::into(self.0).cmp(&Into::<Value>::into(other.0)) - } -} - -impl<T: WrapperType> From<OrdPrimitive<T>> for Value { - fn from(p: OrdPrimitive<T>) -> Self { - p.0.into() - } -} - -macro_rules! impl_wrapper { - ($Type: ident, $LogicalType: ident) => { - impl WrapperType for $Type { - type LogicalType = $LogicalType; - type Native = $Type; - - fn from_native(value: Self::Native) -> Self { - value - } - - fn into_native(self) -> Self::Native { - self - } - } - }; -} - -impl_wrapper!(u8, UInt8Type); -impl_wrapper!(u16, UInt16Type); -impl_wrapper!(u32, UInt32Type); -impl_wrapper!(u64, UInt64Type); -impl_wrapper!(i8, Int8Type); -impl_wrapper!(i16, Int16Type); -impl_wrapper!(i32, Int32Type); -impl_wrapper!(i64, Int64Type); -impl_wrapper!(f32, Float32Type); -impl_wrapper!(f64, Float64Type); - -impl WrapperType for Date { - type LogicalType = DateType; - type Native = i32; - - fn from_native(value: i32) -> Self { - Date::new(value) - } - - fn into_native(self) -> i32 { - self.val() - } -} - -impl WrapperType for DateTime { - type LogicalType = DateTimeType; - type Native = i64; - - fn from_native(value: Self::Native) -> Self { - DateTime::new(value) - } - - fn into_native(self) -> Self::Native { - self.val() - } -} - -macro_rules! define_logical_primitive_type { - ($Native: ident, $TypeId: ident, $DataType: ident) => { - // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit - // `struct DataType;` to ensure the serialized JSON string is compatible with previous - // implementation. - #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] - pub struct $DataType {} - - impl LogicalPrimitiveType for $DataType { - type ArrowPrimitive = arrow::datatypes::$DataType; - type Native = $Native; - type Wrapper = $Native; - - fn build_data_type() -> ConcreteDataType { - ConcreteDataType::$TypeId($DataType::default()) - } - - fn type_name() -> &'static str { - stringify!($TypeId) - } - - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { - vector - .as_any() - .downcast_ref::<PrimitiveVector<$DataType>>() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to cast {} to vector of primitive type {}", - vector.vector_type_name(), - stringify!($TypeId) - ), - }) - } - - fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>> { - match value { - ValueRef::Null => Ok(None), - ValueRef::$TypeId(v) => Ok(Some(v.into())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value {:?} to primitive type {}", - other, - stringify!($TypeId), - ), - } - .fail(), - } - } - } - }; -} - -macro_rules!
define_non_timestamp_primitive { - ($Native: ident, $TypeId: ident, $DataType: ident) => { - define_logical_primitive_type!($Native, $TypeId, $DataType); - - impl DataType for $DataType { - fn name(&self) -> &str { - stringify!($TypeId) - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::$TypeId - } - - fn default_value(&self) -> Value { - $Native::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::$TypeId - } - - fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> { - Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } - } - }; -} - -define_non_timestamp_primitive!(u8, UInt8, UInt8Type); -define_non_timestamp_primitive!(u16, UInt16, UInt16Type); -define_non_timestamp_primitive!(u32, UInt32, UInt32Type); -define_non_timestamp_primitive!(u64, UInt64, UInt64Type); -define_non_timestamp_primitive!(i8, Int8, Int8Type); -define_non_timestamp_primitive!(i16, Int16, Int16Type); -define_non_timestamp_primitive!(i32, Int32, Int32Type); -define_non_timestamp_primitive!(f32, Float32, Float32Type); -define_non_timestamp_primitive!(f64, Float64, Float64Type); - -// Timestamp primitive: -define_logical_primitive_type!(i64, Int64, Int64Type); - -impl DataType for Int64Type { - fn name(&self) -> &str { - "Int64" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Int64 - } - - fn default_value(&self) -> Value { - Value::Int64(0) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Int64 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> { - Box::new(PrimitiveVectorBuilder::<Int64Type>::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - true - } -} - -#[cfg(test)] -mod tests { - use std::collections::BinaryHeap; - - use super::*; - - #[test] - fn test_ord_primitive() { - struct Foo<T> - where - T: WrapperType, - { - heap: BinaryHeap<OrdPrimitive<T>>, - } - - impl<T> Foo<T> - where - T: WrapperType, - { - fn push(&mut self, value: T) { - let value = OrdPrimitive::<T>(value); - self.heap.push(value); - } - } - - macro_rules! test { - ($Type:ident) => { - let mut foo = Foo::<$Type> { - heap: BinaryHeap::new(), - }; - foo.push($Type::default()); - }; - } - - test!(u8); - test!(u16); - test!(u32); - test!(u64); - test!(i8); - test!(i16); - test!(i32); - test!(i64); - test!(f32); - test!(f64); - } -} diff --git a/src/datatypes2/src/types/string_type.rs b/src/datatypes2/src/types/string_type.rs deleted file mode 100644 index 799cbbbdd3..0000000000 --- a/src/datatypes2/src/types/string_type.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
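Beyond the smoke test above, the `Median`/`Percentile` use case from the `OrdPrimitive` docs looks roughly like this (`median_f64` is a hypothetical helper, not crate API; the path to `OrdPrimitive` is illustrative):

```rust
use std::collections::BinaryHeap;

use datatypes::types::OrdPrimitive;

// f64 is not Ord, but OrdPrimitive<f64> is, so it can live in a BinaryHeap.
fn median_f64(values: Vec<f64>) -> Option<f64> {
    let mut heap: BinaryHeap<OrdPrimitive<f64>> =
        values.into_iter().map(OrdPrimitive).collect();
    let n = heap.len();
    // Pop past the upper half; for even n this yields the lower middle value.
    for _ in 0..n / 2 {
        heap.pop();
    }
    heap.pop().map(|v| v.as_primitive())
}
```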
- -use std::sync::Arc; - -use arrow::datatypes::DataType as ArrowDataType; -use common_base::bytes::StringBytes; -use serde::{Deserialize, Serialize}; - -use crate::data_type::{DataType, DataTypeRef}; -use crate::prelude::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; -use crate::vectors::{MutableVector, StringVectorBuilder}; - -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct StringType; - -impl StringType { - pub fn arc() -> DataTypeRef { - Arc::new(Self) - } -} - -impl DataType for StringType { - fn name(&self) -> &str { - "String" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::String - } - - fn default_value(&self) -> Value { - StringBytes::default().into() - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Utf8 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> { - Box::new(StringVectorBuilder::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - false - } -} diff --git a/src/datatypes2/src/value.rs b/src/datatypes2/src/value.rs deleted file mode 100644 index bade88d419..0000000000 --- a/src/datatypes2/src/value.rs +++ /dev/null @@ -1,1275 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; - -use common_base::bytes::{Bytes, StringBytes}; -use common_time::date::Date; -use common_time::datetime::DateTime; -use common_time::timestamp::{TimeUnit, Timestamp}; -use datafusion_common::ScalarValue; -pub use ordered_float::OrderedFloat; -use serde::{Deserialize, Serialize}; - -use crate::error::{self, Result}; -use crate::prelude::*; -use crate::type_id::LogicalTypeId; -use crate::vectors::ListVector; - -pub type OrderedF32 = OrderedFloat<f32>; -pub type OrderedF64 = OrderedFloat<f64>; - -/// Value holds a single arbitrary value of any [DataType](crate::data_type::DataType). -/// -/// Comparison between values with different types (except Null) is not allowed.
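The comparison contract in the doc comment above is worth spelling out: ordering is only defined within one type, with `Null` sorting below everything, and cross-type comparisons panic (see `impl_ord_for_value_like!` below). A sketch (paths illustrative):

```rust
use datatypes::value::Value;

fn value_ordering_contract() {
    // Same-type comparisons work; Null sorts before any non-null value.
    assert!(Value::Int32(1) < Value::Int32(2));
    assert!(Value::Null < Value::Int32(0));
    assert!(Value::Boolean(true) > Value::Null);

    // Cross-type comparison is a programming error and panics:
    // let _ = Value::Int32(1) < Value::UInt32(1); // "Cannot compare different values ..."
}
```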
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum Value { - Null, - - // Numeric types: - Boolean(bool), - UInt8(u8), - UInt16(u16), - UInt32(u32), - UInt64(u64), - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), - Float32(OrderedF32), - Float64(OrderedF64), - - // String types: - String(StringBytes), - Binary(Bytes), - - // Date & Time types: - Date(Date), - DateTime(DateTime), - Timestamp(Timestamp), - - List(ListValue), -} - -impl Display for Value { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Value::Null => write!(f, "{}", self.data_type().name()), - Value::Boolean(v) => write!(f, "{}", v), - Value::UInt8(v) => write!(f, "{}", v), - Value::UInt16(v) => write!(f, "{}", v), - Value::UInt32(v) => write!(f, "{}", v), - Value::UInt64(v) => write!(f, "{}", v), - Value::Int8(v) => write!(f, "{}", v), - Value::Int16(v) => write!(f, "{}", v), - Value::Int32(v) => write!(f, "{}", v), - Value::Int64(v) => write!(f, "{}", v), - Value::Float32(v) => write!(f, "{}", v), - Value::Float64(v) => write!(f, "{}", v), - Value::String(v) => write!(f, "{}", v.as_utf8()), - Value::Binary(v) => { - let hex = v - .iter() - .map(|b| format!("{:02x}", b)) - .collect::<Vec<String>>() - .join(""); - write!(f, "{}", hex) - } - Value::Date(v) => write!(f, "{}", v), - Value::DateTime(v) => write!(f, "{}", v), - Value::Timestamp(v) => write!(f, "{}", v.to_iso8601_string()), - Value::List(v) => { - let default = Box::new(vec![]); - let items = v.items().as_ref().unwrap_or(&default); - let items = items - .iter() - .map(|i| i.to_string()) - .collect::<Vec<String>>() - .join(", "); - write!(f, "{}[{}]", v.datatype.name(), items) - } - } - } -} - -impl Value { - /// Returns data type of the value. - /// - /// # Panics - /// Panics if the data type is not supported. - pub fn data_type(&self) -> ConcreteDataType { - // TODO(yingwen): Implement this once all data types are implemented. - match self { - Value::Null => ConcreteDataType::null_datatype(), - Value::Boolean(_) => ConcreteDataType::boolean_datatype(), - Value::UInt8(_) => ConcreteDataType::uint8_datatype(), - Value::UInt16(_) => ConcreteDataType::uint16_datatype(), - Value::UInt32(_) => ConcreteDataType::uint32_datatype(), - Value::UInt64(_) => ConcreteDataType::uint64_datatype(), - Value::Int8(_) => ConcreteDataType::int8_datatype(), - Value::Int16(_) => ConcreteDataType::int16_datatype(), - Value::Int32(_) => ConcreteDataType::int32_datatype(), - Value::Int64(_) => ConcreteDataType::int64_datatype(), - Value::Float32(_) => ConcreteDataType::float32_datatype(), - Value::Float64(_) => ConcreteDataType::float64_datatype(), - Value::String(_) => ConcreteDataType::string_datatype(), - Value::Binary(_) => ConcreteDataType::binary_datatype(), - Value::Date(_) => ConcreteDataType::date_datatype(), - Value::DateTime(_) => ConcreteDataType::datetime_datatype(), - Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), - } - } - - /// Returns true if this is a null value. - pub fn is_null(&self) -> bool { - matches!(self, Value::Null) - } - - /// Cast itself to [ListValue]. - pub fn as_list(&self) -> Result<Option<&ListValue>> { - match self { - Value::Null => Ok(None), - Value::List(v) => Ok(Some(v)), - other => error::CastTypeSnafu { - msg: format!("Failed to cast {:?} to list value", other), - } - .fail(), - } - } - - /// Cast itself to [ValueRef].
- pub fn as_value_ref(&self) -> ValueRef { - match self { - Value::Null => ValueRef::Null, - Value::Boolean(v) => ValueRef::Boolean(*v), - Value::UInt8(v) => ValueRef::UInt8(*v), - Value::UInt16(v) => ValueRef::UInt16(*v), - Value::UInt32(v) => ValueRef::UInt32(*v), - Value::UInt64(v) => ValueRef::UInt64(*v), - Value::Int8(v) => ValueRef::Int8(*v), - Value::Int16(v) => ValueRef::Int16(*v), - Value::Int32(v) => ValueRef::Int32(*v), - Value::Int64(v) => ValueRef::Int64(*v), - Value::Float32(v) => ValueRef::Float32(*v), - Value::Float64(v) => ValueRef::Float64(*v), - Value::String(v) => ValueRef::String(v.as_utf8()), - Value::Binary(v) => ValueRef::Binary(v), - Value::Date(v) => ValueRef::Date(*v), - Value::DateTime(v) => ValueRef::DateTime(*v), - Value::List(v) => ValueRef::List(ListValueRef::Ref { val: v }), - Value::Timestamp(v) => ValueRef::Timestamp(*v), - } - } - - /// Returns the logical type of the value. - pub fn logical_type_id(&self) -> LogicalTypeId { - match self { - Value::Null => LogicalTypeId::Null, - Value::Boolean(_) => LogicalTypeId::Boolean, - Value::UInt8(_) => LogicalTypeId::UInt8, - Value::UInt16(_) => LogicalTypeId::UInt16, - Value::UInt32(_) => LogicalTypeId::UInt32, - Value::UInt64(_) => LogicalTypeId::UInt64, - Value::Int8(_) => LogicalTypeId::Int8, - Value::Int16(_) => LogicalTypeId::Int16, - Value::Int32(_) => LogicalTypeId::Int32, - Value::Int64(_) => LogicalTypeId::Int64, - Value::Float32(_) => LogicalTypeId::Float32, - Value::Float64(_) => LogicalTypeId::Float64, - Value::String(_) => LogicalTypeId::String, - Value::Binary(_) => LogicalTypeId::Binary, - Value::List(_) => LogicalTypeId::List, - Value::Date(_) => LogicalTypeId::Date, - Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(t) => match t.unit() { - TimeUnit::Second => LogicalTypeId::TimestampSecond, - TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, - TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, - TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, - }, - } - } -} - -macro_rules! 
impl_ord_for_value_like { - ($Type: ident, $left: ident, $right: ident) => { - if $left.is_null() && !$right.is_null() { - return Ordering::Less; - } else if !$left.is_null() && $right.is_null() { - return Ordering::Greater; - } else { - match ($left, $right) { - ($Type::Null, $Type::Null) => Ordering::Equal, - ($Type::Boolean(v1), $Type::Boolean(v2)) => v1.cmp(v2), - ($Type::UInt8(v1), $Type::UInt8(v2)) => v1.cmp(v2), - ($Type::UInt16(v1), $Type::UInt16(v2)) => v1.cmp(v2), - ($Type::UInt32(v1), $Type::UInt32(v2)) => v1.cmp(v2), - ($Type::UInt64(v1), $Type::UInt64(v2)) => v1.cmp(v2), - ($Type::Int8(v1), $Type::Int8(v2)) => v1.cmp(v2), - ($Type::Int16(v1), $Type::Int16(v2)) => v1.cmp(v2), - ($Type::Int32(v1), $Type::Int32(v2)) => v1.cmp(v2), - ($Type::Int64(v1), $Type::Int64(v2)) => v1.cmp(v2), - ($Type::Float32(v1), $Type::Float32(v2)) => v1.cmp(v2), - ($Type::Float64(v1), $Type::Float64(v2)) => v1.cmp(v2), - ($Type::String(v1), $Type::String(v2)) => v1.cmp(v2), - ($Type::Binary(v1), $Type::Binary(v2)) => v1.cmp(v2), - ($Type::Date(v1), $Type::Date(v2)) => v1.cmp(v2), - ($Type::DateTime(v1), $Type::DateTime(v2)) => v1.cmp(v2), - ($Type::Timestamp(v1), $Type::Timestamp(v2)) => v1.cmp(v2), - ($Type::List(v1), $Type::List(v2)) => v1.cmp(v2), - _ => panic!( - "Cannot compare different values {:?} and {:?}", - $left, $right - ), - } - } - }; -} - -impl PartialOrd for Value { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Value { - fn cmp(&self, other: &Self) -> Ordering { - impl_ord_for_value_like!(Value, self, other) - } -} - -macro_rules! impl_value_from { - ($Variant: ident, $Type: ident) => { - impl From<$Type> for Value { - fn from(value: $Type) -> Self { - Value::$Variant(value.into()) - } - } - - impl From> for Value { - fn from(value: Option<$Type>) -> Self { - match value { - Some(v) => Value::$Variant(v.into()), - None => Value::Null, - } - } - } - }; -} - -impl_value_from!(Boolean, bool); -impl_value_from!(UInt8, u8); -impl_value_from!(UInt16, u16); -impl_value_from!(UInt32, u32); -impl_value_from!(UInt64, u64); -impl_value_from!(Int8, i8); -impl_value_from!(Int16, i16); -impl_value_from!(Int32, i32); -impl_value_from!(Int64, i64); -impl_value_from!(Float32, f32); -impl_value_from!(Float64, f64); -impl_value_from!(String, StringBytes); -impl_value_from!(Binary, Bytes); -impl_value_from!(Date, Date); -impl_value_from!(DateTime, DateTime); -impl_value_from!(Timestamp, Timestamp); - -impl From for Value { - fn from(string: String) -> Value { - Value::String(string.into()) - } -} - -impl From<&str> for Value { - fn from(string: &str) -> Value { - Value::String(string.into()) - } -} - -impl From> for Value { - fn from(bytes: Vec) -> Value { - Value::Binary(bytes.into()) - } -} - -impl From<&[u8]> for Value { - fn from(bytes: &[u8]) -> Value { - Value::Binary(bytes.into()) - } -} - -impl TryFrom for serde_json::Value { - type Error = serde_json::Error; - - fn try_from(value: Value) -> serde_json::Result { - let json_value = match value { - Value::Null => serde_json::Value::Null, - Value::Boolean(v) => serde_json::Value::Bool(v), - Value::UInt8(v) => serde_json::Value::from(v), - Value::UInt16(v) => serde_json::Value::from(v), - Value::UInt32(v) => serde_json::Value::from(v), - Value::UInt64(v) => serde_json::Value::from(v), - Value::Int8(v) => serde_json::Value::from(v), - Value::Int16(v) => serde_json::Value::from(v), - Value::Int32(v) => serde_json::Value::from(v), - Value::Int64(v) => serde_json::Value::from(v), - Value::Float32(v) => 
serde_json::Value::from(v.0), - Value::Float64(v) => serde_json::Value::from(v.0), - Value::String(bytes) => serde_json::Value::String(bytes.as_utf8().to_string()), - Value::Binary(bytes) => serde_json::to_value(bytes)?, - Value::Date(v) => serde_json::Value::Number(v.val().into()), - Value::DateTime(v) => serde_json::Value::Number(v.val().into()), - Value::List(v) => serde_json::to_value(v)?, - Value::Timestamp(v) => serde_json::to_value(v.value())?, - }; - - Ok(json_value) - } -} - -// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. -/// List value. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ListValue { - /// List of nested Values (boxed to reduce size_of(Value)) - #[allow(clippy::box_collection)] - items: Option>>, - /// Inner values datatype, to distinguish empty lists of different datatypes. - /// Restricted by DataFusion, cannot use null datatype for empty list. - datatype: ConcreteDataType, -} - -impl Eq for ListValue {} - -impl ListValue { - pub fn new(items: Option>>, datatype: ConcreteDataType) -> Self { - Self { items, datatype } - } - - pub fn items(&self) -> &Option>> { - &self.items - } - - pub fn datatype(&self) -> &ConcreteDataType { - &self.datatype - } -} - -impl Default for ListValue { - fn default() -> ListValue { - ListValue::new(None, ConcreteDataType::null_datatype()) - } -} - -impl PartialOrd for ListValue { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ListValue { - fn cmp(&self, other: &Self) -> Ordering { - assert_eq!( - self.datatype, other.datatype, - "Cannot compare different datatypes!" - ); - self.items.cmp(&other.items) - } -} - -impl TryFrom for Value { - type Error = error::Error; - - fn try_from(v: ScalarValue) -> Result { - let v = match v { - ScalarValue::Null => Value::Null, - ScalarValue::Boolean(b) => Value::from(b), - ScalarValue::Float32(f) => Value::from(f), - ScalarValue::Float64(f) => Value::from(f), - ScalarValue::Int8(i) => Value::from(i), - ScalarValue::Int16(i) => Value::from(i), - ScalarValue::Int32(i) => Value::from(i), - ScalarValue::Int64(i) => Value::from(i), - ScalarValue::UInt8(u) => Value::from(u), - ScalarValue::UInt16(u) => Value::from(u), - ScalarValue::UInt32(u) => Value::from(u), - ScalarValue::UInt64(u) => Value::from(u), - ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { - Value::from(s.map(StringBytes::from)) - } - ScalarValue::Binary(b) - | ScalarValue::LargeBinary(b) - | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, field) => { - let items = if let Some(vs) = vs { - let vs = vs - .into_iter() - .map(ScalarValue::try_into) - .collect::>()?; - Some(Box::new(vs)) - } else { - None - }; - let datatype = ConcreteDataType::try_from(field.data_type())?; - Value::List(ListValue::new(items, datatype)) - } - ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), - ScalarValue::Date64(d) => d - .map(|x| Value::DateTime(DateTime::new(x))) - .unwrap_or(Value::Null), - ScalarValue::TimestampSecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Second))) - .unwrap_or(Value::Null), - ScalarValue::TimestampMillisecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Millisecond))) - .unwrap_or(Value::Null), - ScalarValue::TimestampMicrosecond(t, _) => t - .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Microsecond))) - .unwrap_or(Value::Null), - ScalarValue::TimestampNanosecond(t, _) => t - .map(|x| 
Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) - .unwrap_or(Value::Null), - ScalarValue::Decimal128(_, _, _) - | ScalarValue::Time64(_) - | ScalarValue::IntervalYearMonth(_) - | ScalarValue::IntervalDayTime(_) - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Struct(_, _) - | ScalarValue::Dictionary(_, _) => { - return error::UnsupportedArrowTypeSnafu { - arrow_type: v.get_datatype(), - } - .fail() - } - }; - Ok(v) - } -} - -/// Reference to [Value]. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ValueRef<'a> { - Null, - - // Numeric types: - Boolean(bool), - UInt8(u8), - UInt16(u16), - UInt32(u32), - UInt64(u64), - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), - Float32(OrderedF32), - Float64(OrderedF64), - - // String types: - String(&'a str), - Binary(&'a [u8]), - - // Date & Time types: - Date(Date), - DateTime(DateTime), - Timestamp(Timestamp), - List(ListValueRef<'a>), -} - -macro_rules! impl_as_for_value_ref { - ($value: ident, $Variant: ident) => { - match $value { - ValueRef::Null => Ok(None), - ValueRef::$Variant(v) => Ok(Some(*v)), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value ref {:?} to {}", - other, - stringify!($Variant) - ), - } - .fail(), - } - }; -} - -impl<'a> ValueRef<'a> { - /// Returns true if this is null. - pub fn is_null(&self) -> bool { - matches!(self, ValueRef::Null) - } - - /// Cast itself to binary slice. - pub fn as_binary(&self) -> Result> { - impl_as_for_value_ref!(self, Binary) - } - - /// Cast itself to string slice. - pub fn as_string(&self) -> Result> { - impl_as_for_value_ref!(self, String) - } - - /// Cast itself to boolean. - pub fn as_boolean(&self) -> Result> { - impl_as_for_value_ref!(self, Boolean) - } - - /// Cast itself to [Date]. - pub fn as_date(&self) -> Result> { - impl_as_for_value_ref!(self, Date) - } - - /// Cast itself to [DateTime]. - pub fn as_datetime(&self) -> Result> { - impl_as_for_value_ref!(self, DateTime) - } - - pub fn as_timestamp(&self) -> Result> { - impl_as_for_value_ref!(self, Timestamp) - } - - /// Cast itself to [ListValueRef]. - pub fn as_list(&self) -> Result> { - impl_as_for_value_ref!(self, List) - } -} - -impl<'a> PartialOrd for ValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'a> Ord for ValueRef<'a> { - fn cmp(&self, other: &Self) -> Ordering { - impl_ord_for_value_like!(ValueRef, self, other) - } -} - -macro_rules! 
impl_value_ref_from { - ($Variant:ident, $Type:ident) => { - impl From<$Type> for ValueRef<'_> { - fn from(value: $Type) -> Self { - ValueRef::$Variant(value.into()) - } - } - - impl From<Option<$Type>> for ValueRef<'_> { - fn from(value: Option<$Type>) -> Self { - match value { - Some(v) => ValueRef::$Variant(v.into()), - None => ValueRef::Null, - } - } - } - }; -} - -impl_value_ref_from!(Boolean, bool); -impl_value_ref_from!(UInt8, u8); -impl_value_ref_from!(UInt16, u16); -impl_value_ref_from!(UInt32, u32); -impl_value_ref_from!(UInt64, u64); -impl_value_ref_from!(Int8, i8); -impl_value_ref_from!(Int16, i16); -impl_value_ref_from!(Int32, i32); -impl_value_ref_from!(Int64, i64); -impl_value_ref_from!(Float32, f32); -impl_value_ref_from!(Float64, f64); -impl_value_ref_from!(Date, Date); -impl_value_ref_from!(DateTime, DateTime); -impl_value_ref_from!(Timestamp, Timestamp); - -impl<'a> From<&'a str> for ValueRef<'a> { - fn from(string: &'a str) -> ValueRef<'a> { - ValueRef::String(string) - } -} - -impl<'a> From<&'a [u8]> for ValueRef<'a> { - fn from(bytes: &'a [u8]) -> ValueRef<'a> { - ValueRef::Binary(bytes) - } -} - -impl<'a> From<Option<ListValueRef<'a>>> for ValueRef<'a> { - fn from(list: Option<ListValueRef<'a>>) -> ValueRef<'a> { - match list { - Some(v) => ValueRef::List(v), - None => ValueRef::Null, - } - } -} - -/// Reference to a [ListValue]. -/// -/// Now comparison still requires some allocation (call of `to_value()`) and -/// might be avoidable by downcasting and comparing the underlying array slice -/// if it becomes a bottleneck. -#[derive(Debug, Clone, Copy)] -pub enum ListValueRef<'a> { - // TODO(yingwen): Consider replacing this with VectorRef. - Indexed { vector: &'a ListVector, idx: usize }, - Ref { val: &'a ListValue }, -} - -impl<'a> ListValueRef<'a> { - /// Convert self to [Value]. This method clones the underlying data. - fn to_value(self) -> Value { - match self { - ListValueRef::Indexed { vector, idx } => vector.get(idx), - ListValueRef::Ref { val } => Value::List(val.clone()), - } - } -} - -impl<'a> PartialEq for ListValueRef<'a> { - fn eq(&self, other: &Self) -> bool { - self.to_value().eq(&other.to_value()) - } -} - -impl<'a> Eq for ListValueRef<'a> {} - -impl<'a> Ord for ListValueRef<'a> { - fn cmp(&self, other: &Self) -> Ordering { - // Respect the order of `Value` by converting into value before comparison.
- self.to_value().cmp(&other.to_value()) - } -} - -impl<'a> PartialOrd for ListValueRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use num_traits::Float; - - use super::*; - - #[test] - fn test_try_from_scalar_value() { - assert_eq!( - Value::Boolean(true), - ScalarValue::Boolean(Some(true)).try_into().unwrap() - ); - assert_eq!( - Value::Boolean(false), - ScalarValue::Boolean(Some(false)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Boolean(None).try_into().unwrap()); - - assert_eq!( - Value::Float32(1.0f32.into()), - ScalarValue::Float32(Some(1.0f32)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Float32(None).try_into().unwrap()); - - assert_eq!( - Value::Float64(2.0f64.into()), - ScalarValue::Float64(Some(2.0f64)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Float64(None).try_into().unwrap()); - - assert_eq!( - Value::Int8(i8::MAX), - ScalarValue::Int8(Some(i8::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int8(None).try_into().unwrap()); - - assert_eq!( - Value::Int16(i16::MAX), - ScalarValue::Int16(Some(i16::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int16(None).try_into().unwrap()); - - assert_eq!( - Value::Int32(i32::MAX), - ScalarValue::Int32(Some(i32::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int32(None).try_into().unwrap()); - - assert_eq!( - Value::Int64(i64::MAX), - ScalarValue::Int64(Some(i64::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Int64(None).try_into().unwrap()); - - assert_eq!( - Value::UInt8(u8::MAX), - ScalarValue::UInt8(Some(u8::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt8(None).try_into().unwrap()); - - assert_eq!( - Value::UInt16(u16::MAX), - ScalarValue::UInt16(Some(u16::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt16(None).try_into().unwrap()); - - assert_eq!( - Value::UInt32(u32::MAX), - ScalarValue::UInt32(Some(u32::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt32(None).try_into().unwrap()); - - assert_eq!( - Value::UInt64(u64::MAX), - ScalarValue::UInt64(Some(u64::MAX)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::UInt64(None).try_into().unwrap()); - - assert_eq!( - Value::from("hello"), - ScalarValue::Utf8(Some("hello".to_string())) - .try_into() - .unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Utf8(None).try_into().unwrap()); - - assert_eq!( - Value::from("large_hello"), - ScalarValue::LargeUtf8(Some("large_hello".to_string())) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::LargeUtf8(None).try_into().unwrap() - ); - - assert_eq!( - Value::from("world".as_bytes()), - ScalarValue::Binary(Some("world".as_bytes().to_vec())) - .try_into() - .unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Binary(None).try_into().unwrap()); - - assert_eq!( - Value::from("large_world".as_bytes()), - ScalarValue::LargeBinary(Some("large_world".as_bytes().to_vec())) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::LargeBinary(None).try_into().unwrap() - ); - - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![Value::Int32(1), Value::Null])), - ConcreteDataType::int32_datatype() - )), - ScalarValue::new_list( - Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), - ArrowDataType::Int32, - ) - .try_into() 
- .unwrap() - ); - assert_eq!( - Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::new_list(None, ArrowDataType::UInt32) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Date(Date::new(123)), - ScalarValue::Date32(Some(123)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Date32(None).try_into().unwrap()); - - assert_eq!( - Value::DateTime(DateTime::new(456)), - ScalarValue::Date64(Some(456)).try_into().unwrap() - ); - assert_eq!(Value::Null, ScalarValue::Date64(None).try_into().unwrap()); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Second)), - ScalarValue::TimestampSecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampSecond(None, None).try_into().unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), - ScalarValue::TimestampMillisecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampMillisecond(None, None) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Microsecond)), - ScalarValue::TimestampMicrosecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampMicrosecond(None, None) - .try_into() - .unwrap() - ); - - assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), - ScalarValue::TimestampNanosecond(Some(1), None) - .try_into() - .unwrap() - ); - assert_eq!( - Value::Null, - ScalarValue::TimestampNanosecond(None, None) - .try_into() - .unwrap() - ); - - let result: Result = ScalarValue::Decimal128(Some(1), 0, 0).try_into(); - result - .unwrap_err() - .to_string() - .contains("Unsupported arrow data type, type: Decimal(0, 0)"); - } - - #[test] - fn test_value_from_inner() { - assert_eq!(Value::Boolean(true), Value::from(true)); - assert_eq!(Value::Boolean(false), Value::from(false)); - - assert_eq!(Value::UInt8(u8::MIN), Value::from(u8::MIN)); - assert_eq!(Value::UInt8(u8::MAX), Value::from(u8::MAX)); - - assert_eq!(Value::UInt16(u16::MIN), Value::from(u16::MIN)); - assert_eq!(Value::UInt16(u16::MAX), Value::from(u16::MAX)); - - assert_eq!(Value::UInt32(u32::MIN), Value::from(u32::MIN)); - assert_eq!(Value::UInt32(u32::MAX), Value::from(u32::MAX)); - - assert_eq!(Value::UInt64(u64::MIN), Value::from(u64::MIN)); - assert_eq!(Value::UInt64(u64::MAX), Value::from(u64::MAX)); - - assert_eq!(Value::Int8(i8::MIN), Value::from(i8::MIN)); - assert_eq!(Value::Int8(i8::MAX), Value::from(i8::MAX)); - - assert_eq!(Value::Int16(i16::MIN), Value::from(i16::MIN)); - assert_eq!(Value::Int16(i16::MAX), Value::from(i16::MAX)); - - assert_eq!(Value::Int32(i32::MIN), Value::from(i32::MIN)); - assert_eq!(Value::Int32(i32::MAX), Value::from(i32::MAX)); - - assert_eq!(Value::Int64(i64::MIN), Value::from(i64::MIN)); - assert_eq!(Value::Int64(i64::MAX), Value::from(i64::MAX)); - - assert_eq!( - Value::Float32(OrderedFloat(f32::MIN)), - Value::from(f32::MIN) - ); - assert_eq!( - Value::Float32(OrderedFloat(f32::MAX)), - Value::from(f32::MAX) - ); - - assert_eq!( - Value::Float64(OrderedFloat(f64::MIN)), - Value::from(f64::MIN) - ); - assert_eq!( - Value::Float64(OrderedFloat(f64::MAX)), - Value::from(f64::MAX) - ); - - let string_bytes = StringBytes::from("hello"); - assert_eq!( - Value::String(string_bytes.clone()), - Value::from(string_bytes) - ); - - let bytes = Bytes::from(b"world".as_slice()); - assert_eq!(Value::Binary(bytes.clone()), Value::from(bytes)); - } - - fn 
check_type_and_value(data_type: &ConcreteDataType, value: &Value) { - assert_eq!(*data_type, value.data_type()); - assert_eq!(data_type.logical_type_id(), value.logical_type_id()); - } - - #[test] - fn test_value_datatype() { - check_type_and_value(&ConcreteDataType::boolean_datatype(), &Value::Boolean(true)); - check_type_and_value(&ConcreteDataType::uint8_datatype(), &Value::UInt8(u8::MIN)); - check_type_and_value( - &ConcreteDataType::uint16_datatype(), - &Value::UInt16(u16::MIN), - ); - check_type_and_value( - &ConcreteDataType::uint16_datatype(), - &Value::UInt16(u16::MAX), - ); - check_type_and_value( - &ConcreteDataType::uint32_datatype(), - &Value::UInt32(u32::MIN), - ); - check_type_and_value( - &ConcreteDataType::uint64_datatype(), - &Value::UInt64(u64::MIN), - ); - check_type_and_value(&ConcreteDataType::int8_datatype(), &Value::Int8(i8::MIN)); - check_type_and_value(&ConcreteDataType::int16_datatype(), &Value::Int16(i16::MIN)); - check_type_and_value(&ConcreteDataType::int32_datatype(), &Value::Int32(i32::MIN)); - check_type_and_value(&ConcreteDataType::int64_datatype(), &Value::Int64(i64::MIN)); - check_type_and_value( - &ConcreteDataType::float32_datatype(), - &Value::Float32(OrderedFloat(f32::MIN)), - ); - check_type_and_value( - &ConcreteDataType::float64_datatype(), - &Value::Float64(OrderedFloat(f64::MIN)), - ); - check_type_and_value( - &ConcreteDataType::string_datatype(), - &Value::String(StringBytes::from("hello")), - ); - check_type_and_value( - &ConcreteDataType::binary_datatype(), - &Value::Binary(Bytes::from(b"world".as_slice())), - ); - check_type_and_value( - &ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - &Value::List(ListValue::new( - Some(Box::new(vec![Value::Int32(10)])), - ConcreteDataType::int32_datatype(), - )), - ); - check_type_and_value( - &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), - &Value::List(ListValue::default()), - ); - check_type_and_value( - &ConcreteDataType::date_datatype(), - &Value::Date(Date::new(1)), - ); - check_type_and_value( - &ConcreteDataType::datetime_datatype(), - &Value::DateTime(DateTime::new(1)), - ); - check_type_and_value( - &ConcreteDataType::timestamp_millisecond_datatype(), - &Value::Timestamp(Timestamp::from_millis(1)), - ); - } - - #[test] - fn test_value_from_string() { - let hello = "hello".to_string(); - assert_eq!( - Value::String(StringBytes::from(hello.clone())), - Value::from(hello) - ); - - let world = "world"; - assert_eq!(Value::String(StringBytes::from(world)), Value::from(world)); - } - - #[test] - fn test_value_from_bytes() { - let hello = b"hello".to_vec(); - assert_eq!( - Value::Binary(Bytes::from(hello.clone())), - Value::from(hello) - ); - - let world: &[u8] = b"world"; - assert_eq!(Value::Binary(Bytes::from(world)), Value::from(world)); - } - - fn to_json(value: Value) -> serde_json::Value { - value.try_into().unwrap() - } - - #[test] - fn test_to_json_value() { - assert_eq!(serde_json::Value::Null, to_json(Value::Null)); - assert_eq!(serde_json::Value::Bool(true), to_json(Value::Boolean(true))); - assert_eq!( - serde_json::Value::Number(20u8.into()), - to_json(Value::UInt8(20)) - ); - assert_eq!( - serde_json::Value::Number(20i8.into()), - to_json(Value::Int8(20)) - ); - assert_eq!( - serde_json::Value::Number(2000u16.into()), - to_json(Value::UInt16(2000)) - ); - assert_eq!( - serde_json::Value::Number(2000i16.into()), - to_json(Value::Int16(2000)) - ); - assert_eq!( - serde_json::Value::Number(3000u32.into()), - to_json(Value::UInt32(3000)) - ); - 
assert_eq!( - serde_json::Value::Number(3000i32.into()), - to_json(Value::Int32(3000)) - ); - assert_eq!( - serde_json::Value::Number(4000u64.into()), - to_json(Value::UInt64(4000)) - ); - assert_eq!( - serde_json::Value::Number(4000i64.into()), - to_json(Value::Int64(4000)) - ); - assert_eq!( - serde_json::Value::from(125.0f32), - to_json(Value::Float32(125.0.into())) - ); - assert_eq!( - serde_json::Value::from(125.0f64), - to_json(Value::Float64(125.0.into())) - ); - assert_eq!( - serde_json::Value::String(String::from("hello")), - to_json(Value::String(StringBytes::from("hello"))) - ); - assert_eq!( - serde_json::Value::from(b"world".as_slice()), - to_json(Value::Binary(Bytes::from(b"world".as_slice()))) - ); - assert_eq!( - serde_json::Value::Number(5000i32.into()), - to_json(Value::Date(Date::new(5000))) - ); - assert_eq!( - serde_json::Value::Number(5000i64.into()), - to_json(Value::DateTime(DateTime::new(5000))) - ); - - assert_eq!( - serde_json::Value::Number(1.into()), - to_json(Value::Timestamp(Timestamp::from_millis(1))) - ); - - let json_value: serde_json::Value = - serde_json::from_str(r#"{"items":[{"Int32":123}],"datatype":{"Int32":{}}}"#).unwrap(); - assert_eq!( - json_value, - to_json(Value::List(ListValue { - items: Some(Box::new(vec![Value::Int32(123)])), - datatype: ConcreteDataType::int32_datatype(), - })) - ); - } - - #[test] - fn test_null_value() { - assert!(Value::Null.is_null()); - assert!(!Value::Boolean(true).is_null()); - assert!(Value::Null < Value::Boolean(false)); - assert!(Value::Boolean(true) > Value::Null); - assert!(Value::Null < Value::Int32(10)); - assert!(Value::Int32(10) > Value::Null); - } - - #[test] - fn test_null_value_ref() { - assert!(ValueRef::Null.is_null()); - assert!(!ValueRef::Boolean(true).is_null()); - assert!(ValueRef::Null < ValueRef::Boolean(false)); - assert!(ValueRef::Boolean(true) > ValueRef::Null); - assert!(ValueRef::Null < ValueRef::Int32(10)); - assert!(ValueRef::Int32(10) > ValueRef::Null); - } - - #[test] - fn test_as_value_ref() { - macro_rules! check_as_value_ref { - ($Variant: ident, $data: expr) => { - let value = Value::$Variant($data); - let value_ref = value.as_value_ref(); - let expect_ref = ValueRef::$Variant($data); - - assert_eq!(expect_ref, value_ref); - }; - } - - assert_eq!(ValueRef::Null, Value::Null.as_value_ref()); - check_as_value_ref!(Boolean, true); - check_as_value_ref!(UInt8, 123); - check_as_value_ref!(UInt16, 123); - check_as_value_ref!(UInt32, 123); - check_as_value_ref!(UInt64, 123); - check_as_value_ref!(Int8, -12); - check_as_value_ref!(Int16, -12); - check_as_value_ref!(Int32, -12); - check_as_value_ref!(Int64, -12); - check_as_value_ref!(Float32, OrderedF32::from(16.0)); - check_as_value_ref!(Float64, OrderedF64::from(16.0)); - check_as_value_ref!(Timestamp, Timestamp::from_millis(1)); - - assert_eq!( - ValueRef::String("hello"), - Value::String("hello".into()).as_value_ref() - ); - assert_eq!( - ValueRef::Binary(b"hello"), - Value::Binary("hello".as_bytes().into()).as_value_ref() - ); - - check_as_value_ref!(Date, Date::new(103)); - check_as_value_ref!(DateTime, DateTime::new(1034)); - - let list = ListValue { - items: None, - datatype: ConcreteDataType::int32_datatype(), - }; - assert_eq!( - ValueRef::List(ListValueRef::Ref { val: &list }), - Value::List(list.clone()).as_value_ref() - ); - } - - #[test] - fn test_value_ref_as() { - macro_rules! 
check_as_null { - ($method: ident) => { - assert_eq!(None, ValueRef::Null.$method().unwrap()); - }; - } - - check_as_null!(as_binary); - check_as_null!(as_string); - check_as_null!(as_boolean); - check_as_null!(as_date); - check_as_null!(as_datetime); - check_as_null!(as_list); - - macro_rules! check_as_correct { - ($data: expr, $Variant: ident, $method: ident) => { - assert_eq!(Some($data), ValueRef::$Variant($data).$method().unwrap()); - }; - } - - check_as_correct!("hello", String, as_string); - check_as_correct!("hello".as_bytes(), Binary, as_binary); - check_as_correct!(true, Boolean, as_boolean); - check_as_correct!(Date::new(123), Date, as_date); - check_as_correct!(DateTime::new(12), DateTime, as_datetime); - let list = ListValue { - items: None, - datatype: ConcreteDataType::int32_datatype(), - }; - check_as_correct!(ListValueRef::Ref { val: &list }, List, as_list); - - let wrong_value = ValueRef::Int32(12345); - assert!(wrong_value.as_binary().is_err()); - assert!(wrong_value.as_string().is_err()); - assert!(wrong_value.as_boolean().is_err()); - assert!(wrong_value.as_date().is_err()); - assert!(wrong_value.as_datetime().is_err()); - assert!(wrong_value.as_list().is_err()); - } - - #[test] - fn test_display() { - assert_eq!(Value::Null.to_string(), "Null"); - assert_eq!(Value::UInt8(8).to_string(), "8"); - assert_eq!(Value::UInt16(16).to_string(), "16"); - assert_eq!(Value::UInt32(32).to_string(), "32"); - assert_eq!(Value::UInt64(64).to_string(), "64"); - assert_eq!(Value::Int8(-8).to_string(), "-8"); - assert_eq!(Value::Int16(-16).to_string(), "-16"); - assert_eq!(Value::Int32(-32).to_string(), "-32"); - assert_eq!(Value::Int64(-64).to_string(), "-64"); - assert_eq!(Value::Float32((-32.123).into()).to_string(), "-32.123"); - assert_eq!(Value::Float64((-64.123).into()).to_string(), "-64.123"); - assert_eq!(Value::Float64(OrderedF64::infinity()).to_string(), "inf"); - assert_eq!(Value::Float64(OrderedF64::nan()).to_string(), "NaN"); - assert_eq!(Value::String(StringBytes::from("123")).to_string(), "123"); - assert_eq!( - Value::Binary(Bytes::from(vec![1, 2, 3])).to_string(), - "010203" - ); - assert_eq!(Value::Date(Date::new(0)).to_string(), "1970-01-01"); - assert_eq!( - Value::DateTime(DateTime::new(0)).to_string(), - "1970-01-01 00:00:00" - ); - assert_eq!( - Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)).to_string(), - "1970-01-01 00:00:01+0000" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![Value::Int8(1), Value::Int8(2)])), - ConcreteDataType::int8_datatype(), - )) - .to_string(), - "Int8[1, 2]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_second_datatype(), - )) - .to_string(), - "TimestampSecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_millisecond_datatype(), - )) - .to_string(), - "TimestampMillisecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_microsecond_datatype(), - )) - .to_string(), - "TimestampMicrosecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_nanosecond_datatype(), - )) - .to_string(), - "TimestampNanosecondType[]" - ); - } -} diff --git a/src/datatypes2/src/vectors.rs b/src/datatypes2/src/vectors.rs deleted file mode 100644 index 38fa762d4b..0000000000 --- a/src/datatypes2/src/vectors.rs +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright 2022 Greptime Team -// 
-// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt::Debug; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef}; -use snafu::ensure; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::operations::VectorOp; - -mod binary; -mod boolean; -mod constant; -mod date; -mod datetime; -mod eq; -mod helper; -mod list; -mod null; -mod operations; -mod primitive; -mod string; -mod timestamp; -mod validity; - -pub use binary::{BinaryVector, BinaryVectorBuilder}; -pub use boolean::{BooleanVector, BooleanVectorBuilder}; -pub use constant::ConstantVector; -pub use date::{DateVector, DateVectorBuilder}; -pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; -pub use helper::Helper; -pub use list::{ListIter, ListVector, ListVectorBuilder}; -pub use null::{NullVector, NullVectorBuilder}; -pub use primitive::{ - Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, - Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, - Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, - UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, -}; -pub use string::{StringVector, StringVectorBuilder}; -pub use timestamp::{ - TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, - TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, - TimestampSecondVector, TimestampSecondVectorBuilder, -}; -pub use validity::Validity; - -// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify -// some code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. -/// Vector of data values. -pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { - /// Returns the data type of the vector. - /// - /// This may require heap allocation. - fn data_type(&self) -> ConcreteDataType; - - fn vector_type_name(&self) -> String; - - /// Returns the vector as [Any](std::any::Any) so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - - /// Returns the number of elements in the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert this vector to a new arrow [ArrayRef]. - fn to_arrow_array(&self) -> ArrayRef; - - /// Convert this vector to a new boxed arrow [Array]. - fn to_boxed_arrow_array(&self) -> Box<dyn Array>; - - /// Returns the validity of the Array. - fn validity(&self) -> Validity; - - /// Returns the memory size of the vector. - fn memory_size(&self) -> usize; - - /// The number of null slots on this [`Vector`]. - /// - /// # Implementation - /// This is `O(1)`.
- fn null_count(&self) -> usize; - - /// Returns true when it's a [ConstantVector]. - fn is_const(&self) -> bool { - false - } - - /// Returns whether row is null. - fn is_null(&self, row: usize) -> bool; - - /// Returns true if all values in the vector are NULL. - fn only_null(&self) -> bool { - self.null_count() == self.len() - } - - /// Slices the `Vector`, returning a new `VectorRef`. - /// - /// # Panics - /// This function panics if `offset + length > self.len()`. - fn slice(&self, offset: usize, length: usize) -> VectorRef; - - /// Returns a clone of the value at `index`. - /// - /// # Panics - /// Panics if `index` is out of bounds. - fn get(&self, index: usize) -> Value; - - /// Returns a clone of the value at `index`, or an error if `index` - /// is out of bounds. - fn try_get(&self, index: usize) -> Result<Value> { - ensure!( - index < self.len(), - error::BadArrayAccessSnafu { - index, - size: self.len() - } - ); - Ok(self.get(index)) - } - - /// Returns the reference of the value at `index`. - /// - /// # Panics - /// Panics if `index` is out of bounds. - fn get_ref(&self, index: usize) -> ValueRef; -} - -pub type VectorRef = Arc<dyn Vector>; - -/// Mutable vector that could be used to build an immutable vector. -pub trait MutableVector: Send + Sync { - /// Returns the data type of the vector. - fn data_type(&self) -> ConcreteDataType; - - /// Returns the length of the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert to Any, to enable dynamic casting. - fn as_any(&self) -> &dyn Any; - - /// Convert to mutable Any, to enable dynamic casting. - fn as_mut_any(&mut self) -> &mut dyn Any; - - /// Convert `self` to an (immutable) [VectorRef] and reset `self`. - fn to_vector(&mut self) -> VectorRef; - - /// Push value ref to this mutable vector. - /// - /// Returns an error if the data type does not match. - fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; - - /// Extend this mutable vector by a slice of `vector`. - /// - /// Returns an error if the data type does not match. - /// - /// # Panics - /// Panics if `offset + length > vector.len()`. - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; -} - -/// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. -macro_rules! impl_try_from_arrow_array_for_vector { - ($Array: ident, $Vector: ident) => { - impl $Vector { - pub fn try_from_arrow_array( - array: impl AsRef<dyn arrow::array::Array>, - ) -> crate::error::Result<$Vector> { - use snafu::OptionExt; - - let data = array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .data() - .clone(); - - let concrete_array = $Array::from(data); - Ok($Vector::from(concrete_array)) - } - } - }; -} - -macro_rules! impl_validity_for_vector { - ($array: expr) => { - Validity::from_array_data($array.data()) - }; -} - -macro_rules! impl_get_for_vector { - ($array: expr, $index: ident) => { - if $array.is_valid($index) { - // Safety: The index has been checked by `is_valid()`. - unsafe { $array.value_unchecked($index).into() } - } else { - Value::Null - } - }; -} - -macro_rules! impl_get_ref_for_vector { - ($array: expr, $index: ident) => { - if $array.is_valid($index) { - // Safety: The index has been checked by `is_valid()`. - unsafe { $array.value_unchecked($index).into() } - } else { - ValueRef::Null - } - }; -} - -macro_rules!
impl_extend_for_builder { - ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ - use snafu::OptionExt; - - let sliced_vector = $vector.slice($offset, $length); - let concrete_vector = sliced_vector - .as_any() - .downcast_ref::<$VectorType>() - .with_context(|| crate::error::CastTypeSnafu { - msg: format!( - "Failed to cast vector from {} to {}", - $vector.vector_type_name(), - stringify!($VectorType) - ), - })?; - for value in concrete_vector.iter_data() { - $mutable_vector.push(value); - } - Ok(()) - }}; -} - -pub(crate) use { - impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector, - impl_try_from_arrow_array_for_vector, impl_validity_for_vector, -}; - -#[cfg(test)] -pub mod tests { - use arrow::array::{Array, Int32Array, UInt8Array}; - use serde_json; - - use super::*; - use crate::data_type::DataType; - use crate::types::{Int32Type, LogicalPrimitiveType}; - use crate::vectors::helper::Helper; - - #[test] - fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); - let vector = Helper::try_into_vector(df_column).unwrap(); - assert_eq!( - Int32Type::build_data_type().as_arrow_type(), - vector.data_type().as_arrow_type() - ); - } - - #[test] - fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); - let json_value = Helper::try_into_vector(df_column) - .unwrap() - .serialize_to_json() - .unwrap(); - assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); - } - - #[test] - fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); - let json_value = Helper::try_into_vector(df_column) - .unwrap() - .serialize_to_json() - .unwrap(); - assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); - } -} diff --git a/src/datatypes2/src/vectors/binary.rs b/src/datatypes2/src/vectors/binary.rs deleted file mode 100644 index 3b5defc8ec..0000000000 --- a/src/datatypes2/src/vectors/binary.rs +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; -use snafu::ResultExt; - -use crate::arrow_array::{BinaryArray, MutableBinaryArray}; -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of binary strings. 
-#[derive(Debug, PartialEq)] -pub struct BinaryVector { - array: BinaryArray, -} - -impl BinaryVector { - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> BinaryVector { - BinaryVector { - array: BinaryArray::from(data), - } - } -} - -impl From for BinaryVector { - fn from(array: BinaryArray) -> Self { - Self { array } - } -} - -impl From>>> for BinaryVector { - fn from(data: Vec>>) -> Self { - Self { - array: BinaryArray::from_iter(data), - } - } -} - -impl Vector for BinaryVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::binary_datatype() - } - - fn vector_type_name(&self) -> String { - "BinaryVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(BinaryArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(BinaryArray::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) - } - - fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) - } - - fn get_ref(&self, index: usize) -> ValueRef { - vectors::impl_get_ref_for_vector!(self.array, index) - } -} - -impl ScalarVector for BinaryVector { - type OwnedItem = Vec; - type RefItem<'a> = &'a [u8]; - type Iter<'a> = ArrayIter<&'a BinaryArray>; - type Builder = BinaryVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(self.array.value(idx)) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - self.array.iter() - } -} - -pub struct BinaryVectorBuilder { - mutable_array: MutableBinaryArray, -} - -impl MutableVector for BinaryVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::binary_datatype() - } - - fn len(&self) -> usize { - self.mutable_array.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_binary()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) - } -} - -impl ScalarVectorBuilder for BinaryVectorBuilder { - type VectorType = BinaryVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - mutable_array: MutableBinaryArray::with_capacity(capacity, 0), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - } - - fn finish(&mut self) -> Self::VectorType { - BinaryVector { - array: self.mutable_array.finish(), - } - } -} - -impl Serializable for BinaryVector { - fn serialize_to_json(&self) -> Result> { - self.iter_data() - .map(|v| match v { - None => Ok(serde_json::Value::Null), // if binary vector not present, map to NULL - Some(vec) => serde_json::to_value(vec), - }) - .collect::>() - .context(error::SerializeSnafu) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector); - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use common_base::bytes::Bytes; - use serde_json; - - use super::*; - use crate::arrow_array::BinaryArray; - use crate::data_type::DataType; - use crate::serialize::Serializable; - use crate::types::BinaryType; - - #[test] - fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); - - assert_eq!(2, v.len()); - assert_eq!("BinaryVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); - assert_eq!(128, v.memory_size()); - - for i in 0..2 { - assert!(!v.is_null(i)); - assert_eq!(Value::Binary(Bytes::from(vec![1, 2, 3])), v.get(i)); - assert_eq!(ValueRef::Binary(&[1, 2, 3]), v.get_ref(i)); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(2, arrow_arr.len()); - assert_eq!(&ArrowDataType::LargeBinary, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[[1,2,3],[1,2,3]]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_serialize_binary_vector_with_null_to_json() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(&[1, 2, 3])); - builder.push(None); - builder.push(Some(&[4, 5, 6])); - let vector = builder.finish(); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[[1,2,3],null,[4,5,6]]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_from_arrow_array() { - let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = BinaryArray::from(arrow_array.data().clone()); - let vector = BinaryVector::from(arrow_array); - assert_eq!(original, vector.array); - } - - #[test] - fn test_binary_vector_build_get() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(b"hello")); - builder.push(Some(b"happy")); - builder.push(Some(b"world")); - builder.push(None); - - let vector = builder.finish(); - assert_eq!(b"hello", vector.get_data(0).unwrap()); - assert_eq!(None, vector.get_data(3)); - - 
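// NOTE (editorial sketch, not part of the original patch): the accessors exercised by
// this test trade genericity against cost, assuming the `BinaryVector` API shown above:
//     let bytes: Option<&[u8]> = vector.get_data(0); // typed access, borrows the bytes
//     let by_ref: ValueRef = vector.get_ref(0);      // type-erased access, still zero-copy
//     let owned: Value = vector.get(0);              // type-erased access, clones into an owned `Value`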
assert_eq!(Value::Binary(b"hello".as_slice().into()), vector.get(0)); - assert_eq!(Value::Null, vector.get(3)); - - let mut iter = vector.iter_data(); - assert_eq!(b"hello", iter.next().unwrap().unwrap()); - assert_eq!(b"happy", iter.next().unwrap().unwrap()); - assert_eq!(b"world", iter.next().unwrap().unwrap()); - assert_eq!(None, iter.next().unwrap()); - assert_eq!(None, iter.next()); - } - - #[test] - fn test_binary_vector_validity() { - let mut builder = BinaryVectorBuilder::with_capacity(4); - builder.push(Some(b"hello")); - builder.push(Some(b"world")); - let vector = builder.finish(); - assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); - - let mut builder = BinaryVectorBuilder::with_capacity(3); - builder.push(Some(b"hello")); - builder.push(None); - builder.push(Some(b"world")); - let vector = builder.finish(); - assert_eq!(1, vector.null_count()); - let validity = vector.validity(); - assert!(!validity.is_set(1)); - - assert_eq!(1, validity.null_count()); - assert!(!validity.is_set(1)); - } - - #[test] - fn test_binary_vector_builder() { - let input = BinaryVector::from_slice(&[b"world", b"one", b"two"]); - - let mut builder = BinaryType::default().create_mutable_vector(3); - builder - .push_value_ref(ValueRef::Binary("hello".as_bytes())) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(BinaryVector::from_slice(&[b"hello", b"one", b"two"])); - assert_eq!(expect, vector); - } -} diff --git a/src/datatypes2/src/vectors/boolean.rs b/src/datatypes2/src/vectors/boolean.rs deleted file mode 100644 index 2b4e5b8e10..0000000000 --- a/src/datatypes2/src/vectors/boolean.rs +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::borrow::Borrow; -use std::sync::Arc; - -use arrow::array::{ - Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, -}; -use snafu::ResultExt; - -use crate::data_type::ConcreteDataType; -use crate::error::Result; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of boolean. 
-#[derive(Debug, PartialEq)] -pub struct BooleanVector { - array: BooleanArray, -} - -impl BooleanVector { - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - pub(crate) fn as_boolean_array(&self) -> &BooleanArray { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> BooleanVector { - BooleanVector { - array: BooleanArray::from(data), - } - } - - pub(crate) fn false_count(&self) -> usize { - self.array.false_count() - } -} - -impl From> for BooleanVector { - fn from(data: Vec) -> Self { - BooleanVector { - array: BooleanArray::from(data), - } - } -} - -impl From for BooleanVector { - fn from(array: BooleanArray) -> Self { - Self { array } - } -} - -impl From>> for BooleanVector { - fn from(data: Vec>) -> Self { - BooleanVector { - array: BooleanArray::from(data), - } - } -} - -impl>> FromIterator for BooleanVector { - fn from_iter>(iter: I) -> Self { - BooleanVector { - array: BooleanArray::from_iter(iter), - } - } -} - -impl Vector for BooleanVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::boolean_datatype() - } - - fn vector_type_name(&self) -> String { - "BooleanVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(BooleanArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(BooleanArray::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) - } - - fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) - } - - fn get_ref(&self, index: usize) -> ValueRef { - vectors::impl_get_ref_for_vector!(self.array, index) - } -} - -impl ScalarVector for BooleanVector { - type OwnedItem = bool; - type RefItem<'a> = bool; - type Iter<'a> = ArrayIter<&'a BooleanArray>; - type Builder = BooleanVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(self.array.value(idx)) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - self.array.iter() - } -} - -pub struct BooleanVectorBuilder { - mutable_array: BooleanBuilder, -} - -impl MutableVector for BooleanVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::boolean_datatype() - } - - fn len(&self) -> usize { - self.mutable_array.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_boolean()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) - } -} - -impl ScalarVectorBuilder for BooleanVectorBuilder { - type VectorType = BooleanVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - mutable_array: BooleanBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } - } - - fn finish(&mut self) -> Self::VectorType { - BooleanVector { - array: self.mutable_array.finish(), - } - } -} - -impl Serializable for BooleanVector { - fn serialize_to_json(&self) -> Result> { - self.iter_data() - .map(serde_json::to_value) - .collect::>() - .context(crate::error::SerializeSnafu) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(BooleanArray, BooleanVector); - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use serde_json; - - use super::*; - use crate::data_type::DataType; - use crate::serialize::Serializable; - use crate::types::BooleanType; - - #[test] - fn test_boolean_vector_misc() { - let bools = vec![true, false, true, true, false, false, true, true, false]; - let v = BooleanVector::from(bools.clone()); - assert_eq!(9, v.len()); - assert_eq!("BooleanVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); - assert_eq!(64, v.memory_size()); - - for (i, b) in bools.iter().enumerate() { - assert!(!v.is_null(i)); - assert_eq!(Value::Boolean(*b), v.get(i)); - assert_eq!(ValueRef::Boolean(*b), v.get_ref(i)); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(9, arrow_arr.len()); - assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_boolean_vector_to_json() { - let vector = BooleanVector::from(vec![true, false, true, true, false, false]); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[true,false,true,true,false,false]", - serde_json::to_string(&json_value).unwrap(), - ); - } - - #[test] - fn test_serialize_boolean_vector_with_null_to_json() { - let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); - - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[true,null,false]", - serde_json::to_string(&json_value).unwrap(), - ); - } - - #[test] - fn test_boolean_vector_from_vec() { - let input = vec![false, true, false, true]; - let vec = BooleanVector::from(input.clone()); - assert_eq!(4, vec.len()); - for (i, v) in input.into_iter().enumerate() { - assert_eq!(Some(v), vec.get_data(i), "failed at {}", i) - } - } - - #[test] - fn test_boolean_vector_from_iter() { - let input = vec![Some(false), Some(true), Some(false), Some(true)]; - let vec = input.iter().collect::(); - assert_eq!(4, vec.len()); - for (i, v) in input.into_iter().enumerate() { - assert_eq!(v, vec.get_data(i), "failed at {}", i) - } - } - - #[test] - fn test_boolean_vector_from_vec_option() { - let input = vec![Some(false), Some(true), None, Some(true)]; - let vec = BooleanVector::from(input.clone()); - assert_eq!(4, vec.len()); - for (i, v) in input.into_iter().enumerate() { - assert_eq!(v, vec.get_data(i), "failed at {}", i) - } - } - - #[test] - fn test_boolean_vector_build_get() { - let input = [Some(true), None, 
Some(false)]; - let mut builder = BooleanVectorBuilder::with_capacity(3); - for v in input { - builder.push(v); - } - let vector = builder.finish(); - assert_eq!(input.len(), vector.len()); - - let res: Vec<_> = vector.iter_data().collect(); - assert_eq!(input, &res[..]); - - for (i, v) in input.into_iter().enumerate() { - assert_eq!(v, vector.get_data(i)); - assert_eq!(Value::from(v), vector.get(i)); - } - } - - #[test] - fn test_boolean_vector_validity() { - let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); - assert_eq!(1, vector.null_count()); - let validity = vector.validity(); - assert_eq!(1, validity.null_count()); - assert!(!validity.is_set(1)); - - let vector = BooleanVector::from(vec![true, false, false]); - assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); - } - - #[test] - fn test_boolean_vector_builder() { - let input = BooleanVector::from_slice(&[true, false, true]); - - let mut builder = BooleanType::default().create_mutable_vector(3); - builder.push_value_ref(ValueRef::Boolean(true)).unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true])); - assert_eq!(expect, vector); - } -} diff --git a/src/datatypes2/src/vectors/constant.rs b/src/datatypes2/src/vectors/constant.rs deleted file mode 100644 index 87739e9131..0000000000 --- a/src/datatypes2/src/vectors/constant.rs +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef}; -use snafu::ResultExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{Result, SerializeSnafu}; -use crate::serialize::Serializable; -use crate::value::{Value, ValueRef}; -use crate::vectors::{BooleanVector, Helper, Validity, Vector, VectorRef}; - -#[derive(Clone)] -pub struct ConstantVector { - length: usize, - vector: VectorRef, -} - -impl ConstantVector { - /// Create a new [ConstantVector]. - /// - /// # Panics - /// Panics if `vector.len() != 1`. - pub fn new(vector: VectorRef, length: usize) -> Self { - assert_eq!(1, vector.len()); - - // Avoid const recursion. - if vector.is_const() { - let vec: &ConstantVector = unsafe { Helper::static_cast(&vector) }; - return Self::new(vec.inner().clone(), length); - } - Self { vector, length } - } - - pub fn inner(&self) -> &VectorRef { - &self.vector - } - - /// Returns the constant value. 
- pub fn get_constant_ref(&self) -> ValueRef { - self.vector.get_ref(0) - } - - pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), self.len()); - - if offsets.is_empty() { - return self.slice(0, 0); - } - - Arc::new(ConstantVector::new( - self.vector.clone(), - *offsets.last().unwrap(), - )) - } - - pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { - let length = self.len() - filter.false_count(); - if length == self.len() { - return Ok(Arc::new(self.clone())); - } - Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) - } -} - -impl Vector for ConstantVector { - fn data_type(&self) -> ConcreteDataType { - self.vector.data_type() - } - - fn vector_type_name(&self) -> String { - "ConstantVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.length - } - - fn to_arrow_array(&self) -> ArrayRef { - let v = self.vector.replicate(&[self.length]); - v.to_arrow_array() - } - - fn to_boxed_arrow_array(&self) -> Box { - let v = self.vector.replicate(&[self.length]); - v.to_boxed_arrow_array() - } - - fn is_const(&self) -> bool { - true - } - - fn validity(&self) -> Validity { - if self.vector.is_null(0) { - Validity::all_null(self.length) - } else { - Validity::all_valid(self.length) - } - } - - fn memory_size(&self) -> usize { - self.vector.memory_size() - } - - fn is_null(&self, _row: usize) -> bool { - self.vector.is_null(0) - } - - fn only_null(&self) -> bool { - self.vector.is_null(0) - } - - fn slice(&self, _offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - vector: self.vector.clone(), - length, - }) - } - - fn get(&self, _index: usize) -> Value { - self.vector.get(0) - } - - fn get_ref(&self, _index: usize) -> ValueRef { - self.vector.get_ref(0) - } - - fn null_count(&self) -> usize { - if self.only_null() { - self.len() - } else { - 0 - } - } -} - -impl fmt::Debug for ConstantVector { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "ConstantVector([{:?}; {}])", self.get(0), self.len()) - } -} - -impl Serializable for ConstantVector { - fn serialize_to_json(&self) -> Result> { - std::iter::repeat(self.get(0)) - .take(self.len()) - .map(serde_json::Value::try_from) - .collect::>() - .context(SerializeSnafu) - } -} - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType as ArrowDataType; - - use super::*; - use crate::vectors::Int32Vector; - - #[test] - fn test_constant_vector_misc() { - let a = Int32Vector::from_slice(vec![1]); - let c = ConstantVector::new(Arc::new(a), 10); - - assert_eq!("ConstantVector", c.vector_type_name()); - assert!(c.is_const()); - assert_eq!(10, c.len()); - assert!(c.validity().is_all_valid()); - assert!(!c.only_null()); - assert_eq!(64, c.memory_size()); - - for i in 0..10 { - assert!(!c.is_null(i)); - assert_eq!(Value::Int32(1), c.get(i)); - } - - let arrow_arr = c.to_arrow_array(); - assert_eq!(10, arrow_arr.len()); - assert_eq!(&ArrowDataType::Int32, arrow_arr.data_type()); - } - - #[test] - fn test_debug_null_array() { - let a = Int32Vector::from_slice(vec![1]); - let c = ConstantVector::new(Arc::new(a), 10); - - let s = format!("{:?}", c); - assert_eq!(s, "ConstantVector([Int32(1); 10])"); - } - - #[test] - fn test_serialize_json() { - let a = Int32Vector::from_slice(vec![1]); - let c = ConstantVector::new(Arc::new(a), 10); - - let s = serde_json::to_string(&c.serialize_to_json().unwrap()).unwrap(); - assert_eq!(s, "[1,1,1,1,1,1,1,1,1,1]"); - } -} diff --git 
a/src/datatypes2/src/vectors/date.rs b/src/datatypes2/src/vectors/date.rs deleted file mode 100644 index d0a66b80fb..0000000000 --- a/src/datatypes2/src/vectors/date.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::types::DateType; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; - -/// Vector of [`Date`](common_time::Date) values. -pub type DateVector = PrimitiveVector<DateType>; -/// Builder to build [DateVector]. -pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow::array::Array; - use common_time::date::Date; - - use super::*; - use crate::data_type::DataType; - use crate::scalars::{ScalarVector, ScalarVectorBuilder}; - use crate::serialize::Serializable; - use crate::types::DateType; - use crate::value::{Value, ValueRef}; - use crate::vectors::{Vector, VectorRef}; - - #[test] - fn test_build_date_vector() { - let mut builder = DateVectorBuilder::with_capacity(4); - builder.push(Some(Date::new(1))); - builder.push(None); - builder.push(Some(Date::new(-1))); - let vector = builder.finish(); - assert_eq!(3, vector.len()); - assert_eq!(Value::Date(Date::new(1)), vector.get(0)); - assert_eq!(ValueRef::Date(Date::new(1)), vector.get_ref(0)); - assert_eq!(Some(Date::new(1)), vector.get_data(0)); - assert_eq!(None, vector.get_data(1)); - assert_eq!(Value::Null, vector.get(1)); - assert_eq!(ValueRef::Null, vector.get_ref(1)); - assert_eq!(Some(Date::new(-1)), vector.get_data(2)); - let mut iter = vector.iter_data(); - assert_eq!(Some(Date::new(1)), iter.next().unwrap()); - assert_eq!(None, iter.next().unwrap()); - assert_eq!(Some(Date::new(-1)), iter.next().unwrap()); - } - - #[test] - fn test_date_scalar() { - let vector = DateVector::from_slice(&[1, 2]); - assert_eq!(2, vector.len()); - assert_eq!(Some(Date::new(1)), vector.get_data(0)); - assert_eq!(Some(Date::new(2)), vector.get_data(1)); - } - - #[test] - fn test_date_vector_builder() { - let input = DateVector::from_slice(&[1, 2, 3]); - - let mut builder = DateType::default().create_mutable_vector(3); - builder - .push_value_ref(ValueRef::Date(Date::new(5))) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3])); - assert_eq!(expect, vector); - } - - #[test] - fn test_date_from_arrow() { - let vector = DateVector::from_slice(&[1, 2]); - let arrow = vector.as_arrow().slice(0, vector.len()); - let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap(); - assert_eq!(vector, vector2); - } - - #[test] - fn test_serialize_date_vector() { - let vector = DateVector::from_slice(&[-1, 0, 1]); - let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!( -
r#"["1969-12-31","1970-01-01","1970-01-02"]"#, - serialized_json - ); - } -} diff --git a/src/datatypes2/src/vectors/datetime.rs b/src/datatypes2/src/vectors/datetime.rs deleted file mode 100644 index a40a3e54d3..0000000000 --- a/src/datatypes2/src/vectors/datetime.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::types::DateTimeType; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; - -/// Vector of [`DateTime`](common_time::Date) -pub type DateTimeVector = PrimitiveVector; -/// Builder for [`DateTimeVector`]. -pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow::array::{Array, PrimitiveArray}; - use common_time::DateTime; - use datafusion_common::from_slice::FromSlice; - - use super::*; - use crate::data_type::DataType; - use crate::prelude::{ - ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, - }; - use crate::serialize::Serializable; - - #[test] - fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); - assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); - assert_eq!(3, v.len()); - assert_eq!("DateTimeVector", v.vector_type_name()); - assert_eq!( - &arrow::datatypes::DataType::Date64, - v.to_arrow_array().data_type() - ); - - assert_eq!(Some(DateTime::new(1)), v.get_data(0)); - assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); - assert_eq!(ValueRef::DateTime(DateTime::new(1)), v.get_ref(0)); - - let mut iter = v.iter_data(); - assert_eq!(Some(DateTime::new(1)), iter.next().unwrap()); - assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); - assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); - assert!(!v.is_null(0)); - assert_eq!(64, v.memory_size()); - - if let Value::DateTime(d) = v.get(0) { - assert_eq!(1, d.val()); - } else { - unreachable!() - } - assert_eq!( - "[\"1970-01-01 00:00:01\",\"1970-01-01 00:00:02\",\"1970-01-01 00:00:03\"]", - serde_json::to_string(&v.serialize_to_json().unwrap()).unwrap() - ); - } - - #[test] - fn test_datetime_vector_builder() { - let mut builder = DateTimeVectorBuilder::with_capacity(3); - builder.push(Some(DateTime::new(1))); - builder.push(None); - builder.push(Some(DateTime::new(-1))); - - let v = builder.finish(); - assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); - assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); - assert_eq!(Value::Null, v.get(1)); - assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - - let input = DateTimeVector::from_wrapper_slice(&[ - DateTime::new(1), - DateTime::new(2), - DateTime::new(3), - ]); - - let mut builder = DateTimeType::default().create_mutable_vector(3); - builder - .push_value_ref(ValueRef::DateTime(DateTime::new(5))) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - 
.extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ - DateTime::new(5), - DateTime::new(2), - DateTime::new(3), - ])); - assert_eq!(expect, vector); - } - - #[test] - fn test_datetime_from_arrow() { - let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); - let arrow = vector.as_arrow().slice(0, vector.len()); - let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); - assert_eq!(vector, vector2); - } -} diff --git a/src/datatypes2/src/vectors/eq.rs b/src/datatypes2/src/vectors/eq.rs deleted file mode 100644 index 55359026d4..0000000000 --- a/src/datatypes2/src/vectors/eq.rs +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use crate::data_type::DataType; -use crate::types::TimestampType; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{ - BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, - StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, - TimestampNanosecondVector, TimestampSecondVector, Vector, -}; -use crate::with_match_primitive_type_id; - -impl Eq for dyn Vector + '_ {} - -impl PartialEq for dyn Vector + '_ { - fn eq(&self, other: &dyn Vector) -> bool { - equal(self, other) - } -} - -impl PartialEq for Arc { - fn eq(&self, other: &dyn Vector) -> bool { - equal(&**self, other) - } -} - -macro_rules! is_vector_eq { - ($VectorType: ident, $lhs: ident, $rhs: ident) => {{ - let lhs = $lhs.as_any().downcast_ref::<$VectorType>().unwrap(); - let rhs = $rhs.as_any().downcast_ref::<$VectorType>().unwrap(); - - lhs == rhs - }}; -} - -fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { - if lhs.data_type() != rhs.data_type() || lhs.len() != rhs.len() { - return false; - } - - if lhs.is_const() || rhs.is_const() { - // Length has been checked before, so we only need to compare inner - // vector here. 
- // Both sides must be constant vectors here; a constant and a non-constant - // vector never downcast to the same concrete type, so bail out early instead - // of panicking on the unwrap below. - if !(lhs.is_const() && rhs.is_const()) { - return false; - } - return equal( - &**lhs - .as_any() - .downcast_ref::<ConstantVector>() - .unwrap() - .inner(), - &**rhs - .as_any() - .downcast_ref::<ConstantVector>() - .unwrap() - .inner(), - ); - } - - use crate::data_type::ConcreteDataType::*; - - let lhs_type = lhs.data_type(); - match lhs.data_type() { - Null(_) => true, - Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs), - Binary(_) => is_vector_eq!(BinaryVector, lhs, rhs), - String(_) => is_vector_eq!(StringVector, lhs, rhs), - Date(_) => is_vector_eq!(DateVector, lhs, rhs), - DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), - Timestamp(t) => match t { - TimestampType::Second(_) => { - is_vector_eq!(TimestampSecondVector, lhs, rhs) - } - TimestampType::Millisecond(_) => { - is_vector_eq!(TimestampMillisecondVector, lhs, rhs) - } - TimestampType::Microsecond(_) => { - is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) - } - TimestampType::Nanosecond(_) => { - is_vector_eq!(TimestampNanosecondVector, lhs, rhs) - } - }, - List(_) => is_vector_eq!(ListVector, lhs, rhs), - UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) - | Float32(_) | Float64(_) => { - with_match_primitive_type_id!(lhs_type.logical_type_id(), |$T| { - let lhs = lhs.as_any().downcast_ref::<PrimitiveVector<$T>>().unwrap(); - let rhs = rhs.as_any().downcast_ref::<PrimitiveVector<$T>>().unwrap(); - - lhs == rhs - }, - { - unreachable!("should not compare {} with {}", lhs.vector_type_name(), rhs.vector_type_name()) - }) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::vectors::{ - list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, - NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, - }; - - fn assert_vector_ref_eq(vector: VectorRef) { - let rhs = vector.clone(); - assert_eq!(vector, rhs); - assert_dyn_vector_eq(&*vector, &*rhs); - } - - fn assert_dyn_vector_eq(lhs: &dyn Vector, rhs: &dyn Vector) { - assert_eq!(lhs, rhs); - } - - fn assert_vector_ref_ne(lhs: VectorRef, rhs: VectorRef) { - assert_ne!(lhs, rhs); - } - - #[test] - fn test_vector_eq() { - assert_vector_ref_eq(Arc::new(BinaryVector::from(vec![ - Some(b"hello".to_vec()), - Some(b"world".to_vec()), - ]))); - assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); - assert_vector_ref_eq(Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![true])), - 5, - ))); - assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); - assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); - assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ - 100, 120, - ]))); - assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ - 100, 120, - ]))); - assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); - - let list_vector = list::tests::new_list_vector(&[ - Some(vec![Some(1), Some(2)]), - None, - Some(vec![Some(3), Some(4)]), - ]); - assert_vector_ref_eq(Arc::new(list_vector)); - - assert_vector_ref_eq(Arc::new(NullVector::new(4))); - assert_vector_ref_eq(Arc::new(StringVector::from(vec![ - Some("hello"), - Some("world"), - ]))); - - assert_vector_ref_eq(Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(UInt8Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(Int16Vector::from_slice(&[1, 2, 3, 4]))); -
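// NOTE (editorial sketch, not part of the original patch): these assertions go through
// the `PartialEq for dyn Vector` impl defined above, so equality is decided by logical
// contents rather than pointer identity; two separately allocated vectors compare equal:
//     let a: VectorRef = Arc::new(UInt16Vector::from_slice(&[1, 2, 3, 4]));
//     let b: VectorRef = Arc::new(UInt16Vector::from_slice(&[1, 2, 3, 4]));
//     assert_eq!(a, b);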
assert_vector_ref_eq(Arc::new(UInt16Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(UInt32Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(Int64Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4]))); - assert_vector_ref_eq(Arc::new(Float32Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]))); - assert_vector_ref_eq(Arc::new(Float64Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]))); - } - - #[test] - fn test_vector_ne() { - assert_vector_ref_ne( - Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), - Arc::new(Int32Vector::from_slice(&[1, 2])), - ); - assert_vector_ref_ne( - Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), - Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4])), - ); - assert_vector_ref_ne( - Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), - Arc::new(BooleanVector::from(vec![true, true])), - ); - assert_vector_ref_ne( - Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![true])), - 5, - )), - Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![true])), - 4, - )), - ); - assert_vector_ref_ne( - Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![true])), - 5, - )), - Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![false])), - 4, - )), - ); - assert_vector_ref_ne( - Arc::new(ConstantVector::new( - Arc::new(BooleanVector::from(vec![true])), - 5, - )), - Arc::new(ConstantVector::new( - Arc::new(Int32Vector::from_slice(vec![1])), - 4, - )), - ); - assert_vector_ref_ne(Arc::new(NullVector::new(5)), Arc::new(NullVector::new(8))); - } -} diff --git a/src/datatypes2/src/vectors/helper.rs b/src/datatypes2/src/vectors/helper.rs deleted file mode 100644 index f3236ca0ec..0000000000 --- a/src/datatypes2/src/vectors/helper.rs +++ /dev/null @@ -1,431 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Vector helper functions, inspired by databend Series mod - -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef, StringArray}; -use arrow::compute; -use arrow::compute::kernels::comparison; -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; -use datafusion_common::ScalarValue; -use snafu::{OptionExt, ResultExt}; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{Scalar, ScalarVectorBuilder}; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, - Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, - ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector, - TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, - UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, -}; - -/// Helper functions for `Vector`. 
-pub struct Helper; - -impl Helper { - /// Get a pointer to the underlying data of this vector. - /// Can be useful for fast comparisons. - /// # Safety - /// Assumes that the `vector` is T. - pub unsafe fn static_cast<T: Any>(vector: &VectorRef) -> &T { - let object = vector.as_ref(); - debug_assert!(object.as_any().is::<T>()); - &*(object as *const dyn Vector as *const T) - } - - pub fn check_get_scalar<T: Scalar>(vector: &VectorRef) -> Result<&<T as Scalar>::VectorType> { - let arr = vector - .as_any() - .downcast_ref::<<T as Scalar>::VectorType>() - .with_context(|| error::UnknownVectorSnafu { - msg: format!( - "downcast vector error, vector type: {:?}, expected vector: {:?}", - vector.vector_type_name(), - std::any::type_name::<T>(), - ), - }); - arr - } - - pub fn check_get<T: Vector + 'static>(vector: &VectorRef) -> Result<&T> { - let arr = vector - .as_any() - .downcast_ref::<T>() - .with_context(|| error::UnknownVectorSnafu { - msg: format!( - "downcast vector error, vector type: {:?}, expected vector: {:?}", - vector.vector_type_name(), - std::any::type_name::<T>(), - ), - }); - arr - } - - pub fn check_get_mutable_vector<T: MutableVector + 'static>( - vector: &mut dyn MutableVector, - ) -> Result<&mut T> { - let ty = vector.data_type(); - let arr = vector - .as_mut_any() - .downcast_mut() - .with_context(|| error::UnknownVectorSnafu { - msg: format!( - "downcast vector error, vector type: {:?}, expected vector: {:?}", - ty, - std::any::type_name::<T>(), - ), - }); - arr - } - - pub fn check_get_scalar_vector<T: Scalar>( - vector: &VectorRef, - ) -> Result<&<T as Scalar>::VectorType> { - let arr = vector - .as_any() - .downcast_ref::<<T as Scalar>::VectorType>() - .with_context(|| error::UnknownVectorSnafu { - msg: format!( - "downcast vector error, vector type: {:?}, expected vector: {:?}", - vector.vector_type_name(), - std::any::type_name::<T>(), - ), - }); - arr - } - - /// Try to cast an arrow scalar value into a vector - pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result<VectorRef> { - let vector = match value { - ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), - ScalarValue::Boolean(v) => { - ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) - } - ScalarValue::Float32(v) => { - ConstantVector::new(Arc::new(Float32Vector::from(vec![v])), length) - } - ScalarValue::Float64(v) => { - ConstantVector::new(Arc::new(Float64Vector::from(vec![v])), length) - } - ScalarValue::Int8(v) => { - ConstantVector::new(Arc::new(Int8Vector::from(vec![v])), length) - } - ScalarValue::Int16(v) => { - ConstantVector::new(Arc::new(Int16Vector::from(vec![v])), length) - } - ScalarValue::Int32(v) => { - ConstantVector::new(Arc::new(Int32Vector::from(vec![v])), length) - } - ScalarValue::Int64(v) => { - ConstantVector::new(Arc::new(Int64Vector::from(vec![v])), length) - } - ScalarValue::UInt8(v) => { - ConstantVector::new(Arc::new(UInt8Vector::from(vec![v])), length) - } - ScalarValue::UInt16(v) => { - ConstantVector::new(Arc::new(UInt16Vector::from(vec![v])), length) - } - ScalarValue::UInt32(v) => { - ConstantVector::new(Arc::new(UInt32Vector::from(vec![v])), length) - } - ScalarValue::UInt64(v) => { - ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) - } - ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { - ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - } - ScalarValue::Binary(v) - | ScalarValue::LargeBinary(v) - | ScalarValue::FixedSizeBinary(_, v) => { - ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) - } - ScalarValue::List(v, field) => { - let item_type = ConcreteDataType::try_from(field.data_type())?; - let mut
builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); - if let Some(values) = v { - let values = values - .into_iter() - .map(ScalarValue::try_into) - .collect::>()?; - let list_value = ListValue::new(Some(Box::new(values)), item_type); - builder.push(Some(ListValueRef::Ref { val: &list_value })); - } else { - builder.push(None); - } - let list_vector = builder.to_vector(); - ConstantVector::new(list_vector, length) - } - ScalarValue::Date32(v) => { - ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) - } - ScalarValue::Date64(v) => { - ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) - } - ScalarValue::TimestampSecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) - } - ScalarValue::TimestampMillisecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) - } - ScalarValue::TimestampMicrosecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) - } - ScalarValue::TimestampNanosecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) - } - ScalarValue::Decimal128(_, _, _) - | ScalarValue::Time64(_) - | ScalarValue::IntervalYearMonth(_) - | ScalarValue::IntervalDayTime(_) - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Struct(_, _) - | ScalarValue::Dictionary(_, _) => { - return error::ConversionSnafu { - from: format!("Unsupported scalar value: {}", value), - } - .fail() - } - }; - - Ok(Arc::new(vector)) - } - - /// Try to cast an arrow array into vector - /// - /// # Panics - /// Panic if given arrow data type is not supported. - pub fn try_into_vector(array: impl AsRef) -> Result { - Ok(match array.as_ref().data_type() { - ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), - ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), - ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), - ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), - ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), - ArrowDataType::Int64 => Arc::new(Int64Vector::try_from_arrow_array(array)?), - ArrowDataType::UInt8 => Arc::new(UInt8Vector::try_from_arrow_array(array)?), - ArrowDataType::UInt16 => Arc::new(UInt16Vector::try_from_arrow_array(array)?), - ArrowDataType::UInt32 => Arc::new(UInt32Vector::try_from_arrow_array(array)?), - ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), - ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), - ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), - ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), - ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), - ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), - ArrowDataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), - TimeUnit::Millisecond => { - Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) 
- } - TimeUnit::Microsecond => { - Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) - } - TimeUnit::Nanosecond => { - Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) - } - }, - ArrowDataType::Float16 - | ArrowDataType::Time32(_) - | ArrowDataType::Time64(_) - | ArrowDataType::Duration(_) - | ArrowDataType::Interval(_) - | ArrowDataType::Binary - | ArrowDataType::FixedSizeBinary(_) - | ArrowDataType::LargeUtf8 - | ArrowDataType::LargeList(_) - | ArrowDataType::FixedSizeList(_, _) - | ArrowDataType::Struct(_) - | ArrowDataType::Union(_, _, _) - | ArrowDataType::Dictionary(_, _) - | ArrowDataType::Decimal128(_, _) - | ArrowDataType::Decimal256(_, _) - | ArrowDataType::Map(_, _) => { - unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) - } - }) - } - - /// Try to cast slice of `arrays` to vectors. - pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result> { - arrays.iter().map(Self::try_into_vector).collect() - } - - /// Perform SQL like operation on `names` and a scalar `s`. - pub fn like_utf8(names: Vec, s: &str) -> Result { - let array = StringArray::from(names); - - let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; - - let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?; - Helper::try_into_vector(result) - } -} - -#[cfg(test)] -mod tests { - use arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }; - use arrow::datatypes::{Field, Int32Type}; - use common_time::{Date, DateTime}; - - use super::*; - use crate::value::Value; - use crate::vectors::ConcreteDataType; - - #[test] - fn test_try_into_vectors() { - let arrays: Vec = vec![ - Arc::new(Int32Array::from(vec![1])), - Arc::new(Int32Array::from(vec![2])), - Arc::new(Int32Array::from(vec![3])), - ]; - let vectors = Helper::try_into_vectors(&arrays); - assert!(vectors.is_ok()); - let vectors = vectors.unwrap(); - vectors.iter().for_each(|v| assert_eq!(1, v.len())); - assert_eq!(Value::Int32(1), vectors[0].get(0)); - assert_eq!(Value::Int32(2), vectors[1].get(0)); - assert_eq!(Value::Int32(3), vectors[2].get(0)); - } - - #[test] - fn test_try_into_date_vector() { - let vector = DateVector::from(vec![Some(1), Some(2), None]); - let arrow_array = vector.to_arrow_array(); - assert_eq!(&ArrowDataType::Date32, arrow_array.data_type()); - let vector_converted = Helper::try_into_vector(arrow_array).unwrap(); - assert_eq!(vector.len(), vector_converted.len()); - for i in 0..vector_converted.len() { - assert_eq!(vector.get(i), vector_converted.get(i)); - } - } - - #[test] - fn test_try_from_scalar_date_value() { - let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); - assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); - assert_eq!(3, vector.len()); - for i in 0..vector.len() { - assert_eq!(Value::Date(Date::new(42)), vector.get(i)); - } - } - - #[test] - fn test_try_from_scalar_datetime_value() { - let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); - assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); - assert_eq!(3, vector.len()); - for i in 0..vector.len() { - assert_eq!(Value::DateTime(DateTime::new(42)), vector.get(i)); - } - } - - 
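// NOTE (editorial sketch, not part of the original patch): `try_from_scalar_value`
// wraps the scalar in a `ConstantVector` of the requested length, so every row reads
// back the same value; a minimal usage sketch, assuming the `Helper` API above:
//     let v = Helper::try_from_scalar_value(ScalarValue::Int32(Some(7)), 4).unwrap();
//     assert!(v.is_const());
//     assert_eq!(4, v.len());
//     assert_eq!(Value::Int32(7), v.get(2)); // any index yields the constant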
#[test] - fn test_try_from_list_value() { - let value = ScalarValue::List( - Some(vec![ - ScalarValue::Int32(Some(1)), - ScalarValue::Int32(Some(2)), - ]), - Box::new(Field::new("item", ArrowDataType::Int32, true)), - ); - let vector = Helper::try_from_scalar_value(value, 3).unwrap(); - assert_eq!( - ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - vector.data_type() - ); - assert_eq!(3, vector.len()); - for i in 0..vector.len() { - let v = vector.get(i); - let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap(); - assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items); - } - } - - #[test] - fn test_like_utf8() { - fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { - let actual = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(*actual, StringVector::from(expected)); - } - - let names: Vec = vec!["greptime", "hello", "public", "world"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let ret = Helper::like_utf8(names.clone(), "%ll%").unwrap(); - assert_vector(vec!["hello"], &ret); - - let ret = Helper::like_utf8(names.clone(), "%time").unwrap(); - assert_vector(vec!["greptime"], &ret); - - let ret = Helper::like_utf8(names.clone(), "%ld").unwrap(); - assert_vector(vec!["world"], &ret); - - let ret = Helper::like_utf8(names, "%").unwrap(); - assert_vector(vec!["greptime", "hello", "public", "world"], &ret); - } - - fn check_try_into_vector(array: impl Array + 'static) { - let array: ArrayRef = Arc::new(array); - let vector = Helper::try_into_vector(array.clone()).unwrap(); - assert_eq!(&array, &vector.to_arrow_array()); - } - - #[test] - fn test_try_into_vector() { - check_try_into_vector(NullArray::new(2)); - check_try_into_vector(BooleanArray::from(vec![true, false])); - check_try_into_vector(LargeBinaryArray::from(vec![ - "hello".as_bytes(), - "world".as_bytes(), - ])); - check_try_into_vector(Int8Array::from(vec![1, 2, 3])); - check_try_into_vector(Int16Array::from(vec![1, 2, 3])); - check_try_into_vector(Int32Array::from(vec![1, 2, 3])); - check_try_into_vector(Int64Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); - check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); - check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); - check_try_into_vector(StringArray::from(vec!["hello", "world"])); - check_try_into_vector(Date32Array::from(vec![1, 2, 3])); - check_try_into_vector(Date64Array::from(vec![1, 2, 3])); - let data = vec![None, Some(vec![Some(6), Some(7)])]; - let list_array = ListArray::from_iter_primitive::(data); - check_try_into_vector(list_array); - check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); - } -} diff --git a/src/datatypes2/src/vectors/list.rs b/src/datatypes2/src/vectors/list.rs deleted file mode 100644 index 747e03557b..0000000000 --- a/src/datatypes2/src/vectors/list.rs +++ /dev/null @@ -1,747 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{ - Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, -}; -use arrow::buffer::Buffer; -use arrow::datatypes::DataType as ArrowDataType; -use serde_json::Value as JsonValue; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::Result; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; -use crate::serialize::Serializable; -use crate::types::ListType; -use crate::value::{ListValue, ListValueRef, Value, ValueRef}; -use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; - -/// Vector of Lists, basically backed by Arrow's `ListArray`. -#[derive(Debug, PartialEq)] -pub struct ListVector { - array: ListArray, - /// The datatype of the items in the list. - item_type: ConcreteDataType, -} - -impl ListVector { - /// Iterate elements as [VectorRef]. - pub fn values_iter(&self) -> impl Iterator>> + '_ { - self.array - .iter() - .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose()) - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self { - Self { - array: ListArray::from(data), - item_type, - } - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } -} - -impl Vector for ListVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(self.item_type.clone())) - } - - fn vector_type_name(&self) -> String { - "ListVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(ListArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(ListArray::from(data)) - } - - fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) - } - - fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data_and_type(data, self.item_type.clone())) - } - - fn get(&self, index: usize) -> Value { - if !self.array.is_valid(index) { - return Value::Null; - } - - let array = &self.array.value(index); - let vector = Helper::try_into_vector(array).unwrap_or_else(|_| { - panic!( - "arrow array with datatype {:?} cannot converted to our vector", - array.data_type() - ) - }); - let values = (0..vector.len()) - .map(|i| vector.get(i)) - .collect::>(); - Value::List(ListValue::new( - Some(Box::new(values)), - self.item_type.clone(), - )) - } - - fn get_ref(&self, index: usize) -> ValueRef { - ValueRef::List(ListValueRef::Indexed { - vector: self, - idx: index, - }) - } -} - -impl Serializable for ListVector { - fn serialize_to_json(&self) -> Result> { - self.array - .iter() - 
.map(|v| match v { - None => Ok(JsonValue::Null), - Some(v) => Helper::try_into_vector(v) - .and_then(|v| v.serialize_to_json()) - .map(JsonValue::Array), - }) - .collect() - } -} - -impl From for ListVector { - fn from(array: ListArray) -> Self { - let item_type = ConcreteDataType::from_arrow_type(match array.data_type() { - ArrowDataType::List(field) => field.data_type(), - other => panic!( - "Try to create ListVector from an arrow array with type {:?}", - other - ), - }); - Self { array, item_type } - } -} - -vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector); - -pub struct ListIter<'a> { - vector: &'a ListVector, - idx: usize, -} - -impl<'a> ListIter<'a> { - fn new(vector: &'a ListVector) -> ListIter { - ListIter { vector, idx: 0 } - } -} - -impl<'a> Iterator for ListIter<'a> { - type Item = Option>; - - #[inline] - fn next(&mut self) -> Option { - if self.idx >= self.vector.len() { - return None; - } - - let idx = self.idx; - self.idx += 1; - - if self.vector.is_null(idx) { - return Some(None); - } - - Some(Some(ListValueRef::Indexed { - vector: self.vector, - idx, - })) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - (self.vector.len(), Some(self.vector.len())) - } -} - -impl ScalarVector for ListVector { - type OwnedItem = ListValue; - type RefItem<'a> = ListValueRef<'a>; - type Iter<'a> = ListIter<'a>; - type Builder = ListVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - if self.array.is_valid(idx) { - Some(ListValueRef::Indexed { vector: self, idx }) - } else { - None - } - } - - fn iter_data(&self) -> Self::Iter<'_> { - ListIter::new(self) - } -} - -// Ports from arrow's GenericListBuilder. -// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs -/// [ListVector] builder. -pub struct ListVectorBuilder { - item_type: ConcreteDataType, - offsets_builder: Int32BufferBuilder, - null_buffer_builder: NullBufferBuilder, - values_builder: Box, -} - -impl ListVectorBuilder { - /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity` - /// is the number of items to pre-allocate space for in this builder. - pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { - let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); - offsets_builder.append(0); - // The actual required capacity might be greater than the capacity of the `ListVector` - // if the child vector has more than one element. - let values_builder = item_type.create_mutable_vector(capacity); - - ListVectorBuilder { - item_type, - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(capacity), - values_builder, - } - } - - /// Finish the current variable-length list vector slot. 
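// A sketch of the intended invariant: the builder mirrors arrow's list
// layout. The child builder holds all items back to back, each finished
// slot appends one running end offset, and list `i` is reconstructed as
// `child[offsets[i]..offsets[i + 1]]`, with validity tracked separately
// by the null buffer builder below.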
- fn finish_list(&mut self, is_valid: bool) { - self.offsets_builder - .append(i32::try_from(self.values_builder.len()).unwrap()); - self.null_buffer_builder.append(is_valid); - } - - fn push_null(&mut self) { - self.finish_list(false); - } - - fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { - if let Some(items) = list_value.items() { - for item in &**items { - self.values_builder.push_value_ref(item.as_value_ref())?; - } - } - - self.finish_list(true); - Ok(()) - } -} - -impl MutableVector for ListVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::list_datatype(self.item_type.clone()) - } - - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - if let Some(list_ref) = value.as_list()? { - match list_ref { - ListValueRef::Indexed { vector, idx } => match vector.get(idx).as_list()? { - Some(list_value) => self.push_list_value(list_value)?, - None => self.push_null(), - }, - ListValueRef::Ref { val } => self.push_list_value(val)?, - } - } else { - self.push_null(); - } - - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - for idx in offset..offset + length { - let value = vector.get_ref(idx); - self.push_value_ref(value)?; - } - - Ok(()) - } -} - -impl ScalarVectorBuilder for ListVectorBuilder { - type VectorType = ListVector; - - fn with_capacity(_capacity: usize) -> Self { - panic!("Must use ListVectorBuilder::with_type_capacity()"); - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - // We expect the input ListValue has the same inner type as the builder when using - // push(), so just panic if `push_value_ref()` returns error, which indicate an - // invalid input value type. - self.push_value_ref(value.into()).unwrap_or_else(|e| { - panic!( - "Failed to push value, expect value type {:?}, err:{}", - self.item_type, e - ); - }); - } - - fn finish(&mut self) -> Self::VectorType { - let len = self.len(); - let values_vector = self.values_builder.to_vector(); - let values_arr = values_vector.to_arrow_array(); - let values_data = values_arr.data(); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.null_buffer_builder.finish(); - // Re-initialize the offsets_builder. - self.offsets_builder.append(0); - let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); - let array_data_builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer); - - let array_data = unsafe { array_data_builder.build_unchecked() }; - let array = ListArray::from(array_data); - - ListVector { - array, - item_type: self.item_type.clone(), - } - } -} - -// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs -/// Builder for creating the null bit buffer. -/// This builder only materializes the buffer when we append `false`. -/// If you only append `true`s to the builder, what you get will be -/// `None` when calling [`finish`](#method.finish). -/// This optimization is **very** important for the performance. 
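// A small sketch of the behavior this optimization buys, assuming the
// builder defined below:
//
//     let mut b = NullBufferBuilder::new(16);
//     b.append(true);
//     b.append(true);
//     assert!(b.finish().is_none()); // all valid: no bitmap ever allocated
//
//     let mut b = NullBufferBuilder::new(16);
//     b.append(true);
//     b.append(false); // first null materializes the bitmap and backfills
//     assert!(b.finish().is_some());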
-#[derive(Debug)] -struct NullBufferBuilder { - bitmap_builder: Option, - /// Store the length of the buffer before materializing. - len: usize, - capacity: usize, -} - -impl NullBufferBuilder { - /// Creates a new empty builder. - /// `capacity` is the number of bits in the null buffer. - fn new(capacity: usize) -> Self { - Self { - bitmap_builder: None, - len: 0, - capacity, - } - } - - fn len(&self) -> usize { - if let Some(b) = &self.bitmap_builder { - b.len() - } else { - self.len - } - } - - /// Appends a `true` into the builder - /// to indicate that this item is not null. - #[inline] - fn append_non_null(&mut self) { - if let Some(buf) = self.bitmap_builder.as_mut() { - buf.append(true) - } else { - self.len += 1; - } - } - - /// Appends a `false` into the builder - /// to indicate that this item is null. - #[inline] - fn append_null(&mut self) { - self.materialize_if_needed(); - self.bitmap_builder.as_mut().unwrap().append(false); - } - - /// Appends a boolean value into the builder. - #[inline] - fn append(&mut self, not_null: bool) { - if not_null { - self.append_non_null() - } else { - self.append_null() - } - } - - /// Builds the null buffer and resets the builder. - /// Returns `None` if the builder only contains `true`s. - fn finish(&mut self) -> Option { - let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); - self.bitmap_builder = None; - self.len = 0; - buf - } - - #[inline] - fn materialize_if_needed(&mut self) { - if self.bitmap_builder.is_none() { - self.materialize() - } - } - - #[cold] - fn materialize(&mut self) { - if self.bitmap_builder.is_none() { - let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); - b.append_n(self.len, true); - self.bitmap_builder = Some(b); - } - } -} - -#[cfg(test)] -pub mod tests { - use arrow::array::{Int32Array, Int32Builder, ListBuilder}; - use serde_json::json; - - use super::*; - use crate::scalars::ScalarRef; - use crate::types::ListType; - use crate::vectors::Int32Vector; - - pub fn new_list_vector(data: &[Option>>]) -> ListVector { - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); - for vec_opt in data { - if let Some(vec) = vec_opt { - let values = vec.iter().map(|v| Value::from(*v)).collect(); - let values = Some(Box::new(values)); - let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); - - builder.push(Some(ListValueRef::Ref { val: &list_value })); - } else { - builder.push(None); - } - } - - builder.finish() - } - - fn new_list_array(data: &[Option>>]) -> ListArray { - let mut builder = ListBuilder::new(Int32Builder::new()); - for vec_opt in data { - if let Some(vec) = vec_opt { - for value_opt in vec { - builder.values().append_option(*value_opt); - } - - builder.append(true); - } else { - builder.append(false); - } - } - - builder.finish() - } - - #[test] - fn test_list_vector() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - - assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), - list_vector.data_type() - ); - assert_eq!("ListVector", list_vector.vector_type_name()); - assert_eq!(3, list_vector.len()); - assert!(!list_vector.is_null(0)); - assert!(list_vector.is_null(1)); - assert!(!list_vector.is_null(2)); - - let arrow_array = new_list_array(&data); - assert_eq!( - arrow_array, - *list_vector - .to_arrow_array() - .as_any() - .downcast_ref::() - .unwrap() - ); - let validity = 
list_vector.validity(); - assert!(!validity.is_all_null()); - assert!(!validity.is_all_valid()); - assert!(validity.is_set(0)); - assert!(!validity.is_set(1)); - assert!(validity.is_set(2)); - assert_eq!(256, list_vector.memory_size()); - - let slice = list_vector.slice(0, 2).to_arrow_array(); - let sliced_array = slice.as_any().downcast_ref::().unwrap(); - assert_eq!( - Int32Array::from_iter_values([1, 2, 3]), - *sliced_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - ); - assert!(sliced_array.is_null(1)); - - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![ - Value::Int32(1), - Value::Int32(2), - Value::Int32(3) - ])), - ConcreteDataType::int32_datatype() - )), - list_vector.get(0) - ); - let value_ref = list_vector.get_ref(0); - assert!(matches!( - value_ref, - ValueRef::List(ListValueRef::Indexed { .. }) - )); - let value_ref = list_vector.get_ref(1); - if let ValueRef::List(ListValueRef::Indexed { idx, .. }) = value_ref { - assert_eq!(1, idx); - } else { - unreachable!() - } - assert_eq!(Value::Null, list_vector.get(1)); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6) - ])), - ConcreteDataType::int32_datatype() - )), - list_vector.get(2) - ); - } - - #[test] - fn test_from_arrow_array() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let arrow_array = new_list_array(&data); - let array_ref: ArrayRef = Arc::new(arrow_array); - let expect = new_list_vector(&data); - - // Test try from ArrayRef - let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!(expect, list_vector); - - // Test from - let arrow_array = new_list_array(&data); - let list_vector = ListVector::from(arrow_array); - assert_eq!(expect, list_vector); - } - - #[test] - fn test_iter_list_vector_values() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - - assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), - list_vector.data_type() - ); - let mut iter = list_vector.values_iter(); - assert_eq!( - Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap() - ); - assert!(iter.next().unwrap().unwrap().is_none()); - assert_eq!( - Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap(), - ); - assert!(iter.next().is_none()) - } - - #[test] - fn test_serialize_to_json() { - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), - ]; - - let list_vector = new_list_vector(&data); - assert_eq!( - vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], - list_vector.serialize_to_json().unwrap() - ); - } - - #[test] - fn test_list_vector_builder() { - let mut builder = - ListType::new(ConcreteDataType::int32_datatype()).create_mutable_vector(3); - builder - .push_value_ref(ValueRef::List(ListValueRef::Ref { - val: &ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6), - ])), - ConcreteDataType::int32_datatype(), - ), - })) - .unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(7), Some(8), None]), - ]; - let input = new_list_vector(&data); - builder.extend_slice_of(&input, 1, 2).unwrap(); - 
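// `extend_slice_of(&input, 1, 2)` copies rows [1, 3) of `input`, i.e. the
// `None` slot and `Some(vec![Some(7), Some(8), None])`, which is why the
// expected vector below ends with exactly those two elements.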
assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(new_list_vector(&[ - Some(vec![Some(4), None, Some(6)]), - None, - Some(vec![Some(7), Some(8), None]), - ])); - assert_eq!(expect, vector); - } - - #[test] - fn test_list_vector_for_scalar() { - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 2); - builder.push(None); - builder.push(Some(ListValueRef::Ref { - val: &ListValue::new( - Some(Box::new(vec![ - Value::Int32(4), - Value::Null, - Value::Int32(6), - ])), - ConcreteDataType::int32_datatype(), - ), - })); - let vector = builder.finish(); - - let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); - assert_eq!(expect, vector); - - assert!(vector.get_data(0).is_none()); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - vector.get_data(1).unwrap() - ); - assert_eq!( - *vector.get(1).as_list().unwrap().unwrap(), - vector.get_data(1).unwrap().to_owned_scalar() - ); - - let mut iter = vector.iter_data(); - assert!(iter.next().unwrap().is_none()); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - iter.next().unwrap().unwrap() - ); - assert!(iter.next().is_none()); - - let mut iter = vector.iter_data(); - assert_eq!(2, iter.size_hint().0); - assert_eq!( - ListValueRef::Indexed { - vector: &vector, - idx: 1 - }, - iter.nth(1).unwrap().unwrap() - ); - } -} diff --git a/src/datatypes2/src/vectors/null.rs b/src/datatypes2/src/vectors/null.rs deleted file mode 100644 index bb66e09b39..0000000000 --- a/src/datatypes2/src/vectors/null.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::fmt; -use std::sync::Arc; - -use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; -use snafu::{ensure, OptionExt}; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::serialize::Serializable; -use crate::types::NullType; -use crate::value::{Value, ValueRef}; -use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; - -/// A vector where all elements are nulls. -#[derive(PartialEq)] -pub struct NullVector { - array: NullArray, -} - -// TODO(yingwen): Support null vector with other logical types. -impl NullVector { - /// Create a new `NullVector` with `n` elements. 
- pub fn new(n: usize) -> Self { - Self { - array: NullArray::new(n), - } - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - &self.array - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } -} - -impl From for NullVector { - fn from(array: NullArray) -> Self { - Self { array } - } -} - -impl Vector for NullVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::Null(NullType::default()) - } - - fn vector_type_name(&self) -> String { - "NullVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. - let data = self.to_array_data(); - Arc::new(NullArray::from(data)) - } - - fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(NullArray::from(data)) - } - - fn validity(&self) -> Validity { - Validity::all_null(self.array.len()) - } - - fn memory_size(&self) -> usize { - 0 - } - - fn null_count(&self) -> usize { - self.array.null_count() - } - - fn is_null(&self, _row: usize) -> bool { - true - } - - fn only_null(&self) -> bool { - true - } - - fn slice(&self, _offset: usize, length: usize) -> VectorRef { - Arc::new(Self::new(length)) - } - - fn get(&self, _index: usize) -> Value { - // Skips bound check for null array. - Value::Null - } - - fn get_ref(&self, _index: usize) -> ValueRef { - // Skips bound check for null array. - ValueRef::Null - } -} - -impl fmt::Debug for NullVector { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "NullVector({})", self.len()) - } -} - -impl Serializable for NullVector { - fn serialize_to_json(&self) -> Result> { - Ok(std::iter::repeat(serde_json::Value::Null) - .take(self.len()) - .collect()) - } -} - -vectors::impl_try_from_arrow_array_for_vector!(NullArray, NullVector); - -#[derive(Default)] -pub struct NullVectorBuilder { - length: usize, -} - -impl MutableVector for NullVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::null_datatype() - } - - fn len(&self) -> usize { - self.length - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - let vector = Arc::new(NullVector::new(self.length)); - self.length = 0; - vector - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - ensure!( - value.is_null(), - error::CastTypeSnafu { - msg: format!("Failed to cast value ref {:?} to null", value), - } - ); - - self.length += 1; - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to NullVector", - vector.vector_type_name() - ), - })?; - assert!( - offset + length <= vector.len(), - "offset {} + length {} must less than {}", - offset, - length, - vector.len() - ); - - self.length += length; - Ok(()) - } -} - -pub(crate) fn replicate_null(vector: &NullVector, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.slice(0, 0); - } - - Arc::new(NullVector::new(*offsets.last().unwrap())) -} - -#[cfg(test)] -mod tests { - use serde_json; - - use super::*; - use crate::data_type::DataType; - - #[test] - fn test_null_vector_misc() { - let v = NullVector::new(32); - - assert_eq!(v.len(), 32); - 
assert_eq!(0, v.memory_size()); - let arrow_arr = v.to_arrow_array(); - assert_eq!(arrow_arr.null_count(), 32); - - let array2 = arrow_arr.slice(8, 16); - assert_eq!(array2.len(), 16); - assert_eq!(array2.null_count(), 16); - - assert_eq!("NullVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_null()); - assert!(v.only_null()); - - for i in 0..32 { - assert!(v.is_null(i)); - assert_eq!(Value::Null, v.get(i)); - assert_eq!(ValueRef::Null, v.get_ref(i)); - } - } - - #[test] - fn test_debug_null_vector() { - let array = NullVector::new(1024 * 1024); - assert_eq!(format!("{:?}", array), "NullVector(1048576)"); - } - - #[test] - fn test_serialize_json() { - let vector = NullVector::new(3); - let json_value = vector.serialize_to_json().unwrap(); - assert_eq!( - "[null,null,null]", - serde_json::to_string(&json_value).unwrap() - ); - } - - #[test] - fn test_null_vector_validity() { - let vector = NullVector::new(5); - assert!(vector.validity().is_all_null()); - assert_eq!(5, vector.null_count()); - } - - #[test] - fn test_null_vector_builder() { - let mut builder = NullType::default().create_mutable_vector(3); - builder.push_value_ref(ValueRef::Null).unwrap(); - assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); - - let input = NullVector::new(3); - builder.extend_slice_of(&input, 1, 2).unwrap(); - assert!(builder - .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) - .is_err()); - let vector = builder.to_vector(); - - let expect: VectorRef = Arc::new(input); - assert_eq!(expect, vector); - } -} diff --git a/src/datatypes2/src/vectors/operations.rs b/src/datatypes2/src/vectors/operations.rs deleted file mode 100644 index 70ddb4a031..0000000000 --- a/src/datatypes2/src/vectors/operations.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod filter; -mod find_unique; -mod replicate; - -use common_base::BitVec; - -use crate::error::Result; -use crate::types::LogicalPrimitiveType; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{ - BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, - VectorRef, -}; - -/// Vector compute operations. -pub trait VectorOp { - /// Copies each element according `offsets` parameter. - /// - `i-th` element should be copied `offsets[i] - offsets[i - 1]` times - /// - `0-th` element would be copied `offsets[0]` times - /// - /// # Panics - /// Panics if `offsets.len() != self.len()`. - fn replicate(&self, offsets: &[usize]) -> VectorRef; - - /// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which - /// means there is no elements behind it have same value as it. - /// - /// The caller should ensure - /// 1. the length of `selected` bitmap is equal to `vector.len()`. - /// 2. `vector` and `prev_vector` are sorted. - /// - /// If there are multiple duplicate elements, this function retains the **first** element. 
- /// The first element is considered as unique if the first element of `self` is different - /// from its previous element, that is the last element of `prev_vector`. - /// - /// # Panics - /// Panics if - /// - `selected.len() < self.len()`. - /// - `prev_vector` and `self` have different data types. - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>); - - /// Filters the vector, returns elements matching the `filter` (i.e. where the values are true). - /// - /// Note that the nulls of `filter` are interpreted as `false` will lead to these elements being masked out. - fn filter(&self, filter: &BooleanVector) -> Result; -} - -macro_rules! impl_scalar_vector_op { - ($($VectorType: ident),+) => {$( - impl VectorOp for $VectorType { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_scalar(self, offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap()); - find_unique::find_unique_scalar(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, $VectorType, filter) - } - } - )+}; -} - -impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); - -impl VectorOp for PrimitiveVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) - } -} - -impl VectorOp for NullVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_null(self, offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_null(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, NullVector, filter) - } -} - -impl VectorOp for ConstantVector { - fn replicate(&self, offsets: &[usize]) -> VectorRef { - self.replicate_vector(offsets) - } - - fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); - } - - fn filter(&self, filter: &BooleanVector) -> Result { - self.filter_vector(filter) - } -} diff --git a/src/datatypes2/src/vectors/operations/filter.rs b/src/datatypes2/src/vectors/operations/filter.rs deleted file mode 100644 index 8368a6afb4..0000000000 --- a/src/datatypes2/src/vectors/operations/filter.rs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -macro_rules! filter_non_constant { - ($vector: expr, $VectorType: ty, $filter: ident) => {{ - use std::sync::Arc; - - use arrow::compute; - use snafu::ResultExt; - - let arrow_array = $vector.as_arrow(); - let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) - .context(crate::error::ArrowComputeSnafu)?; - Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) - }}; -} - -pub(crate) use filter_non_constant; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::{Date, DateTime}; - - use crate::scalars::ScalarVector; - use crate::timestamp::{ - TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, - }; - use crate::types::WrapperType; - use crate::vectors::constant::ConstantVector; - use crate::vectors::{ - BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, - }; - - fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { - let v = Int32Vector::from_slice(&input); - let filter = BooleanVector::from_slice(filter); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(Int32Vector::from_slice(&expect)); - assert_eq!(expect, out); - } - - #[test] - fn test_filter_primitive() { - check_filter_primitive(&[], &[], &[]); - check_filter_primitive(&[5], &[5], &[true]); - check_filter_primitive(&[], &[5], &[false]); - check_filter_primitive(&[], &[5, 6], &[false, false]); - check_filter_primitive(&[5, 6], &[5, 6], &[true, true]); - check_filter_primitive(&[], &[5, 6, 7], &[false, false, false]); - check_filter_primitive(&[5], &[5, 6, 7], &[true, false, false]); - check_filter_primitive(&[6], &[5, 6, 7], &[false, true, false]); - check_filter_primitive(&[7], &[5, 6, 7], &[false, false, true]); - check_filter_primitive(&[5, 7], &[5, 6, 7], &[true, false, true]); - } - - fn check_filter_constant(expect_length: usize, input_length: usize, filter: &[bool]) { - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[123])), input_length); - let filter = BooleanVector::from_slice(filter); - let out = v.filter(&filter).unwrap(); - - assert!(out.is_const()); - assert_eq!(expect_length, out.len()); - } - - #[test] - fn test_filter_constant() { - check_filter_constant(0, 0, &[]); - check_filter_constant(1, 1, &[true]); - check_filter_constant(0, 1, &[false]); - check_filter_constant(1, 2, &[false, true]); - check_filter_constant(2, 2, &[true, true]); - check_filter_constant(1, 4, &[false, false, false, true]); - check_filter_constant(2, 4, &[false, true, false, true]); - } - - #[test] - fn test_filter_scalar() { - let v = StringVector::from_slice(&["0", "1", "2", "3"]); - let filter = BooleanVector::from_slice(&[false, true, false, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(StringVector::from_slice(&["1", "3"])); - assert_eq!(expect, out); - } - - #[test] - fn test_filter_null() { - let v = NullVector::new(5); - let filter = BooleanVector::from_slice(&[false, true, false, true, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new(NullVector::new(3)); - assert_eq!(expect, out); - } - - macro_rules! 
impl_filter_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use std::sync::Arc; - - use $crate::vectors::{$VectorType, VectorRef}; - - let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); - let filter = BooleanVector::from_slice(&[false, true, false, true, true]); - let out = v.filter(&filter).unwrap(); - - let expect: VectorRef = Arc::new($VectorType::from_iterator( - [1, 3, 4].into_iter().map($ValueType::$method), - )); - assert_eq!(expect, out); - }}; - } - - #[test] - fn test_filter_date_like() { - impl_filter_date_like_test!(DateVector, Date, new); - impl_filter_date_like_test!(DateTimeVector, DateTime, new); - - impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); - impl_filter_date_like_test!( - TimestampMillisecondVector, - TimestampMillisecond, - from_native - ); - impl_filter_date_like_test!( - TimestampMicrosecondVector, - TimestampMicrosecond, - from_native - ); - impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); - } -} diff --git a/src/datatypes2/src/vectors/operations/find_unique.rs b/src/datatypes2/src/vectors/operations/find_unique.rs deleted file mode 100644 index 7116a9e90d..0000000000 --- a/src/datatypes2/src/vectors/operations/find_unique.rs +++ /dev/null @@ -1,367 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_base::BitVec; - -use crate::scalars::ScalarVector; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{NullVector, Vector}; - -// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as -// selected when it is different from the previous one, and leaves the `selected` unchanged -// in any other case. -pub(crate) fn find_unique_scalar<'a, T: ScalarVector>( - vector: &'a T, - selected: &'a mut BitVec, - prev_vector: Option<&'a T>, -) where - T::RefItem<'a>: PartialEq, -{ - assert!(selected.len() >= vector.len()); - - if vector.is_empty() { - return; - } - - for ((i, current), next) in vector - .iter_data() - .enumerate() - .zip(vector.iter_data().skip(1)) - { - if current != next { - // If next element is a different element, we mark it as selected. - selected.set(i + 1, true); - } - } - - // Marks first element as selected if it is different from previous element, otherwise - // keep selected bitmap unchanged. 
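// For example: with `self = [5, 6, 6]` and `prev_vector = [4, 5]`, the
// pairwise scan above selects only index 1; the check below then leaves
// index 0 unselected because `self[0]` equals the last element of `prev`.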
- let is_first_not_duplicate = prev_vector - .map(|pv| { - if pv.is_empty() { - true - } else { - let last = pv.get_data(pv.len() - 1); - last != vector.get_data(0) - } - }) - .unwrap_or(true); - if is_first_not_duplicate { - selected.set(0, true); - } -} - -pub(crate) fn find_unique_null( - vector: &NullVector, - selected: &mut BitVec, - prev_vector: Option<&NullVector>, -) { - if vector.is_empty() { - return; - } - - let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); - if is_first_not_duplicate { - selected.set(0, true); - } -} - -pub(crate) fn find_unique_constant( - vector: &ConstantVector, - selected: &mut BitVec, - prev_vector: Option<&ConstantVector>, -) { - if vector.is_empty() { - return; - } - - let is_first_not_duplicate = prev_vector - .map(|pv| { - if pv.is_empty() { - true - } else { - vector.get_constant_ref() != pv.get_constant_ref() - } - }) - .unwrap_or(true); - - if is_first_not_duplicate { - selected.set(0, true); - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::{Date, DateTime}; - - use super::*; - use crate::timestamp::*; - use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; - - fn check_bitmap(expect: &[bool], selected: &BitVec) { - let actual = selected.iter().collect::>(); - assert_eq!(expect, actual); - } - - fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) { - check_find_unique_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev); - } - - fn check_find_unique_scalar_opt( - expect: &[bool], - input: impl Iterator>, - prev: Option<&[i32]>, - ) { - let input = Int32Vector::from(input.collect::>()); - let prev = prev.map(Int32Vector::from_slice); - - let mut selected = BitVec::repeat(false, input.len()); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - - check_bitmap(expect, &selected); - } - - #[test] - fn test_find_unique_scalar() { - check_find_unique_scalar(&[], &[], None); - check_find_unique_scalar(&[true], &[1], None); - check_find_unique_scalar(&[true, false], &[1, 1], None); - check_find_unique_scalar(&[true, true], &[1, 2], None); - check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None); - check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None); - check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None); - - check_find_unique_scalar(&[true], &[5], Some(&[])); - check_find_unique_scalar(&[true], &[5], Some(&[3])); - check_find_unique_scalar(&[false], &[5], Some(&[5])); - check_find_unique_scalar(&[false], &[5], Some(&[4, 5])); - check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5])); - check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5])); - check_find_unique_scalar( - &[false, true, false, true, true], - &[5, 6, 6, 7, 8], - Some(&[4, 5]), - ); - - check_find_unique_scalar_opt( - &[true, true, false, true, false], - [Some(1), Some(2), Some(2), None, None].into_iter(), - None, - ); - } - - #[test] - fn test_find_unique_scalar_multi_times_with_prev() { - let prev = Int32Vector::from_slice(&[1]); - - let v1 = Int32Vector::from_slice(&[2, 3, 4]); - let mut selected = BitVec::repeat(false, v1.len()); - v1.find_unique(&mut selected, Some(&prev)); - - // Though element in v2 are the same as prev, but we should still keep them. 
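// `find_unique` only ever sets bits in `selected` and never clears one,
// so the flags set by the earlier pass over `v1` survive this second
// call unchanged.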
- let v2 = Int32Vector::from_slice(&[1, 1, 1]); - v2.find_unique(&mut selected, Some(&prev)); - - check_bitmap(&[true, true, true], &selected); - } - - fn new_bitmap(bits: &[bool]) -> BitVec { - BitVec::from_iter(bits) - } - - #[test] - fn test_find_unique_scalar_with_prev() { - let prev = Int32Vector::from_slice(&[1]); - - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[2, 3, 4, 5]); - v.find_unique(&mut selected, Some(&prev)); - // All elements are different. - check_bitmap(&[true, true, true, true], &selected); - - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, Some(&prev)); - // Though first element is duplicate, but we keep the flag unchanged. - check_bitmap(&[true, true, true, true], &selected); - - // Same case as above, but now `prev` is None. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, true, true, true], &selected); - - // Same case as above, but now `prev` is empty. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = Int32Vector::from_slice(&[1, 2, 3, 4]); - v.find_unique(&mut selected, Some(&Int32Vector::from_slice(&[]))); - check_bitmap(&[true, true, true, true], &selected); - - let mut selected = new_bitmap(&[false, false, false, false]); - let v = Int32Vector::from_slice(&[2, 2, 4, 5]); - v.find_unique(&mut selected, Some(&prev)); - // only v[1] is duplicate. - check_bitmap(&[true, false, true, true], &selected); - } - - fn check_find_unique_null(len: usize) { - let input = NullVector::new(len); - let mut selected = BitVec::repeat(false, input.len()); - input.find_unique(&mut selected, None); - - let mut expect = vec![false; len]; - if !expect.is_empty() { - expect[0] = true; - } - check_bitmap(&expect, &selected); - - let mut selected = BitVec::repeat(false, input.len()); - let prev = Some(NullVector::new(1)); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - let expect = vec![false; len]; - check_bitmap(&expect, &selected); - } - - #[test] - fn test_find_unique_null() { - for len in 0..5 { - check_find_unique_null(len); - } - } - - #[test] - fn test_find_unique_null_with_prev() { - let prev = NullVector::new(1); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = NullVector::new(4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[false, false, true, false], &selected); - - // Prev is None, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, false, true, false], &selected); - - // Prev is empty, select first element. 
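// An empty `prev` behaves like no `prev` at all: there is no trailing
// element to compare the first row against, so the first row must be
// selected.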
- let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&NullVector::new(0))); - check_bitmap(&[true, false, true, false], &selected); - } - - fn check_find_unique_constant(len: usize) { - let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len); - let mut selected = BitVec::repeat(false, len); - input.find_unique(&mut selected, None); - - let mut expect = vec![false; len]; - if !expect.is_empty() { - expect[0] = true; - } - check_bitmap(&expect, &selected); - - let mut selected = BitVec::repeat(false, len); - let prev = Some(ConstantVector::new( - Arc::new(Int32Vector::from_slice(&[8])), - 1, - )); - input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); - let expect = vec![false; len]; - check_bitmap(&expect, &selected); - } - - #[test] - fn test_find_unique_constant() { - for len in 0..5 { - check_find_unique_constant(len); - } - } - - #[test] - fn test_find_unique_constant_with_prev() { - let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 1); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[true, false, true, false]); - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - - // Keep flags unchanged. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[false, false, true, false], &selected); - - // Prev is None, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique(&mut selected, None); - check_bitmap(&[true, false, true, false], &selected); - - // Prev is empty, select first element. - let mut selected = new_bitmap(&[false, false, true, false]); - v.find_unique( - &mut selected, - Some(&ConstantVector::new( - Arc::new(Int32Vector::from_slice(&[1])), - 0, - )), - ); - check_bitmap(&[true, false, true, false], &selected); - - // Different constant vector. - let mut selected = new_bitmap(&[false, false, true, false]); - let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[2])), 4); - v.find_unique(&mut selected, Some(&prev)); - check_bitmap(&[true, false, true, false], &selected); - } - - #[test] - fn test_find_unique_string() { - let input = StringVector::from_slice(&["a", "a", "b", "c"]); - let mut selected = BitVec::repeat(false, 4); - input.find_unique(&mut selected, None); - let expect = vec![true, false, true, true]; - check_bitmap(&expect, &selected); - } - - macro_rules! 
impl_find_unique_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use $crate::vectors::$VectorType; - - let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); - let mut selected = BitVec::repeat(false, 4); - v.find_unique(&mut selected, None); - let expect = vec![true, false, true, true]; - check_bitmap(&expect, &selected); - }}; - } - - #[test] - fn test_find_unique_date_like() { - impl_find_unique_date_like_test!(DateVector, Date, new); - impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); - impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); - impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); - impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); - } -} diff --git a/src/datatypes2/src/vectors/operations/replicate.rs b/src/datatypes2/src/vectors/operations/replicate.rs deleted file mode 100644 index 8216517fc6..0000000000 --- a/src/datatypes2/src/vectors/operations/replicate.rs +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
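// A worked sketch of the end-offset encoding used by `replicate`
// (mirroring the `VectorOp::replicate` contract in operations.rs):
// element `i` is emitted `offsets[i] - offsets[i - 1]` times, and the
// first element `offsets[0]` times.
//
//     let v = StringVector::from_slice(&["0", "1", "2", "3"]);
//     let out = v.replicate(&[1, 3, 5, 6]);
//     // "0" once, "1" twice, "2" twice, "3" once:
//     // out == ["0", "1", "1", "2", "2", "3"], out.len() == 6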
- -use crate::prelude::*; -pub(crate) use crate::vectors::null::replicate_null; -pub(crate) use crate::vectors::primitive::replicate_primitive; - -pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), c.len()); - - if offsets.is_empty() { - return c.slice(0, 0); - } - let mut builder = <::Builder>::with_capacity(c.len()); - - let mut previous_offset = 0; - for (i, offset) in offsets.iter().enumerate() { - let data = c.get_data(i); - for _ in previous_offset..*offset { - builder.push(data); - } - previous_offset = *offset; - } - builder.to_vector() -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_time::timestamp::TimeUnit; - use common_time::{Date, DateTime, Timestamp}; - use paste::paste; - - use super::*; - use crate::vectors::constant::ConstantVector; - use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; - - #[test] - fn test_replicate_primitive() { - let v = Int32Vector::from_iterator(0..5); - let offsets = [0, 1, 2, 3, 4]; - - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - - for i in 0..4 { - assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); - } - } - - #[test] - fn test_replicate_nullable_primitive() { - let v = Int32Vector::from(vec![None, Some(1), None, Some(2)]); - let offsets = [2, 4, 6, 8]; - let v = v.replicate(&offsets); - assert_eq!(8, v.len()); - - let expect: VectorRef = Arc::new(Int32Vector::from(vec![ - None, - None, - Some(1), - Some(1), - None, - None, - Some(2), - Some(2), - ])); - assert_eq!(expect, v); - } - - #[test] - fn test_replicate_scalar() { - let v = StringVector::from_slice(&["0", "1", "2", "3"]); - let offsets = [1, 3, 5, 6]; - - let v = v.replicate(&offsets); - assert_eq!(6, v.len()); - - let expect: VectorRef = Arc::new(StringVector::from_slice(&["0", "1", "1", "2", "2", "3"])); - assert_eq!(expect, v); - } - - #[test] - fn test_replicate_constant() { - let v = Arc::new(StringVector::from_slice(&["hello"])); - let cv = ConstantVector::new(v.clone(), 2); - let offsets = [1, 4]; - - let cv = cv.replicate(&offsets); - assert_eq!(4, cv.len()); - - let expect: VectorRef = Arc::new(ConstantVector::new(v, 4)); - assert_eq!(expect, cv); - } - - #[test] - fn test_replicate_null() { - let v = NullVector::new(0); - let offsets = []; - let v = v.replicate(&offsets); - assert!(v.is_empty()); - - let v = NullVector::new(3); - let offsets = [1, 3, 5]; - - let v = v.replicate(&offsets); - assert_eq!(5, v.len()); - } - - macro_rules! impl_replicate_date_like_test { - ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use $crate::vectors::$VectorType; - - let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); - let offsets = [0, 1, 2, 3, 4]; - - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - - for i in 0..4 { - assert_eq!( - Value::$ValueType($ValueType::$method((i as i32 + 1).into())), - v.get(i) - ); - } - }}; - } - - macro_rules! 
impl_replicate_timestamp_test {
-        ($unit: ident) => {{
-            paste!{
-                use $crate::vectors::[<Timestamp $unit Vector>];
-                use $crate::timestamp::[<Timestamp $unit>];
-                let v = [<Timestamp $unit Vector>]::from_iterator((0..5).map([<Timestamp $unit>]::from));
-                let offsets = [0, 1, 2, 3, 4];
-                let v = v.replicate(&offsets);
-                assert_eq!(4, v.len());
-                for i in 0..4 {
-                    assert_eq!(
-                        Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)),
-                        v.get(i)
-                    );
-                }
-            }
-        }};
-    }
-
-    #[test]
-    fn test_replicate_date_like() {
-        impl_replicate_date_like_test!(DateVector, Date, new);
-        impl_replicate_date_like_test!(DateTimeVector, DateTime, new);
-
-        impl_replicate_timestamp_test!(Second);
-        impl_replicate_timestamp_test!(Millisecond);
-        impl_replicate_timestamp_test!(Microsecond);
-        impl_replicate_timestamp_test!(Nanosecond);
-    }
-}
diff --git a/src/datatypes2/src/vectors/primitive.rs b/src/datatypes2/src/vectors/primitive.rs
deleted file mode 100644
index 7829c31731..0000000000
--- a/src/datatypes2/src/vectors/primitive.rs
+++ /dev/null
@@ -1,552 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::fmt;
-use std::sync::Arc;
-
-use arrow::array::{
-    Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder,
-};
-use serde_json::Value as JsonValue;
-use snafu::OptionExt;
-
-use crate::data_type::ConcreteDataType;
-use crate::error::{self, Result};
-use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
-use crate::serialize::Serializable;
-use crate::types::{
-    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
-    UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
-};
-use crate::value::{Value, ValueRef};
-use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
-
-pub type UInt8Vector = PrimitiveVector<UInt8Type>;
-pub type UInt16Vector = PrimitiveVector<UInt16Type>;
-pub type UInt32Vector = PrimitiveVector<UInt32Type>;
-pub type UInt64Vector = PrimitiveVector<UInt64Type>;
-
-pub type Int8Vector = PrimitiveVector<Int8Type>;
-pub type Int16Vector = PrimitiveVector<Int16Type>;
-pub type Int32Vector = PrimitiveVector<Int32Type>;
-pub type Int64Vector = PrimitiveVector<Int64Type>;
-
-pub type Float32Vector = PrimitiveVector<Float32Type>;
-pub type Float64Vector = PrimitiveVector<Float64Type>;
-
-/// Vector for primitive data types.
-pub struct PrimitiveVector<T: LogicalPrimitiveType> {
-    array: PrimitiveArray<T::ArrowPrimitive>,
-}
-
-impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
-    pub fn new(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
-        Self { array }
-    }
-
-    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
-        let data = array
-            .as_ref()
-            .as_any()
-            .downcast_ref::<PrimitiveArray<T::ArrowPrimitive>>()
-            .with_context(|| error::ConversionSnafu {
-                from: format!("{:?}", array.as_ref().data_type()),
-            })?
-            .data()
-            .clone();
-        let concrete_array = PrimitiveArray::<T::ArrowPrimitive>::from(data);
-        Ok(Self::new(concrete_array))
-    }
-
-    pub fn from_slice<P: AsRef<[T::Native]>>(slice: P) -> Self {
-        let iter = slice.as_ref().iter().copied();
-        Self {
-            array: PrimitiveArray::from_iter_values(iter),
-        }
-    }
-
-    pub fn from_wrapper_slice<P: AsRef<[T::Wrapper]>>(slice: P) -> Self {
-        let iter = slice.as_ref().iter().copied().map(WrapperType::into_native);
-        Self {
-            array: PrimitiveArray::from_iter_values(iter),
-        }
-    }
-
-    pub fn from_vec(array: Vec<T::Native>) -> Self {
-        Self {
-            array: PrimitiveArray::from_iter_values(array),
-        }
-    }
-
-    pub fn from_values<I: IntoIterator<Item = T::Native>>(iter: I) -> Self {
-        Self {
-            array: PrimitiveArray::from_iter_values(iter),
-        }
-    }
-
-    pub(crate) fn as_arrow(&self) -> &PrimitiveArray<T::ArrowPrimitive> {
-        &self.array
-    }
-
-    fn to_array_data(&self) -> ArrayData {
-        self.array.data().clone()
-    }
-
-    fn from_array_data(data: ArrayData) -> Self {
-        Self {
-            array: PrimitiveArray::from(data),
-        }
-    }
-
-    // To distinguish with `Vector::slice()`.
-    fn get_slice(&self, offset: usize, length: usize) -> Self {
-        let data = self.array.data().slice(offset, length);
-        Self::from_array_data(data)
-    }
-}
-
-impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
-    fn data_type(&self) -> ConcreteDataType {
-        T::build_data_type()
-    }
-
-    fn vector_type_name(&self) -> String {
-        format!("{}Vector", T::type_name())
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.array.len()
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let data = self.to_array_data();
-        Arc::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let data = self.to_array_data();
-        Box::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
-    }
-
-    fn validity(&self) -> Validity {
-        vectors::impl_validity_for_vector!(self.array)
-    }
-
-    fn memory_size(&self) -> usize {
-        self.array.get_buffer_memory_size()
-    }
-
-    fn null_count(&self) -> usize {
-        self.array.null_count()
-    }
-
-    fn is_null(&self, row: usize) -> bool {
-        self.array.is_null(row)
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        let data = self.array.data().slice(offset, length);
-        Arc::new(Self::from_array_data(data))
-    }
-
-    fn get(&self, index: usize) -> Value {
-        if self.array.is_valid(index) {
-            // Safety: The index have been checked by `is_valid()`.
-            let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
-            wrapper.into()
-        } else {
-            Value::Null
-        }
-    }
-
-    fn get_ref(&self, index: usize) -> ValueRef {
-        if self.array.is_valid(index) {
-            // Safety: The index have been checked by `is_valid()`.
-            let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
-            wrapper.into()
-        } else {
-            ValueRef::Null
-        }
-    }
-}
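For reference, reading through this `Vector` impl looks like the following sketch (assuming the crate's prelude re-exports `Vector` and `Value`, as the tests elsewhere in this patch do):

use datatypes::prelude::*;
use datatypes::vectors::Int32Vector;

fn main() {
    let v = Int32Vector::from(vec![Some(1), None, Some(3)]);
    assert_eq!(Value::Int32(1), v.get(0));
    // Invalid slots surface as Value::Null rather than a sentinel native value.
    assert_eq!(Value::Null, v.get(1));
    // slice() is zero-copy: it re-slices the shared ArrayData.
    let tail = v.slice(1, 2);
    assert_eq!(2, tail.len());
}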
-
-impl<T: LogicalPrimitiveType> fmt::Debug for PrimitiveVector<T> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_struct("PrimitiveVector")
-            .field("array", &self.array)
-            .finish()
-    }
-}
-
-impl<T: LogicalPrimitiveType> From<PrimitiveArray<T::ArrowPrimitive>> for PrimitiveVector<T> {
-    fn from(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
-        Self { array }
-    }
-}
-
-impl<T: LogicalPrimitiveType> From<Vec<Option<T::Native>>> for PrimitiveVector<T> {
-    fn from(v: Vec<Option<T::Native>>) -> Self {
-        Self {
-            array: PrimitiveArray::from_iter(v),
-        }
-    }
-}
-
-pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> {
-    iter: ArrayIter<&'a PrimitiveArray<T::ArrowPrimitive>>,
-}
-
-impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> {
-    type Item = Option<T::Wrapper>;
-
-    fn next(&mut self) -> Option<Option<T::Wrapper>> {
-        self.iter
-            .next()
-            .map(|item| item.map(T::Wrapper::from_native))
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.iter.size_hint()
-    }
-}
-
-impl<T: LogicalPrimitiveType> ScalarVector for PrimitiveVector<T> {
-    type OwnedItem = T::Wrapper;
-    type RefItem<'a> = T::Wrapper;
-    type Iter<'a> = PrimitiveIter<'a, T>;
-    type Builder = PrimitiveVectorBuilder<T>;
-
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
-        if self.array.is_valid(idx) {
-            Some(T::Wrapper::from_native(self.array.value(idx)))
-        } else {
-            None
-        }
-    }
-
-    fn iter_data(&self) -> Self::Iter<'_> {
-        PrimitiveIter {
-            iter: self.array.iter(),
-        }
-    }
-}
-
-impl<T: LogicalPrimitiveType> Serializable for PrimitiveVector<T> {
-    fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
-        let res = self
-            .iter_data()
-            .map(|v| match v {
-                None => serde_json::Value::Null,
-                // use WrapperType's Into<serde_json::Value> bound instead of
-                // serde_json::to_value to facilitate customized serialization
-                // for WrapperType
-                Some(v) => v.into(),
-            })
-            .collect::<Vec<_>>();
-        Ok(res)
-    }
-}
-
-impl<T: LogicalPrimitiveType> PartialEq for PrimitiveVector<T> {
-    fn eq(&self, other: &PrimitiveVector<T>) -> bool {
-        self.array == other.array
-    }
-}
-
-pub type UInt8VectorBuilder = PrimitiveVectorBuilder<UInt8Type>;
-pub type UInt16VectorBuilder = PrimitiveVectorBuilder<UInt16Type>;
-pub type UInt32VectorBuilder = PrimitiveVectorBuilder<UInt32Type>;
-pub type UInt64VectorBuilder = PrimitiveVectorBuilder<UInt64Type>;
-
-pub type Int8VectorBuilder = PrimitiveVectorBuilder<Int8Type>;
-pub type Int16VectorBuilder = PrimitiveVectorBuilder<Int16Type>;
-pub type Int32VectorBuilder = PrimitiveVectorBuilder<Int32Type>;
-pub type Int64VectorBuilder = PrimitiveVectorBuilder<Int64Type>;
-
-pub type Float32VectorBuilder = PrimitiveVectorBuilder<Float32Type>;
-pub type Float64VectorBuilder = PrimitiveVectorBuilder<Float64Type>;
-
-/// Builder to build a primitive vector.
-pub struct PrimitiveVectorBuilder<T: LogicalPrimitiveType> {
-    mutable_array: PrimitiveBuilder<T::ArrowPrimitive>,
-}
-
-impl<T: LogicalPrimitiveType> MutableVector for PrimitiveVectorBuilder<T> {
-    fn data_type(&self) -> ConcreteDataType {
-        T::build_data_type()
-    }
-
-    fn len(&self) -> usize {
-        self.mutable_array.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        let primitive = T::cast_value_ref(value)?;
-        match primitive {
-            Some(v) => self.mutable_array.append_value(v.into_native()),
-            None => self.mutable_array.append_null(),
-        }
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        let primitive = T::cast_vector(vector)?;
-        // Slice the underlying array to avoid creating a new Arc.
-        let slice = primitive.get_slice(offset, length);
-        for v in slice.iter_data() {
-            self.push(v);
-        }
-        Ok(())
-    }
-}
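A short usage sketch of the two `MutableVector` entry points above, mirroring the crate's own tests (assumes the builder aliases defined earlier in this file and the crate prelude):

use datatypes::prelude::*;
use datatypes::vectors::{Int64Vector, Int64VectorBuilder};

fn main() {
    let mut builder = Int64VectorBuilder::with_capacity(4);
    // push_value_ref type-checks: an Int32 into an Int64 builder is an error.
    builder.push_value_ref(ValueRef::Int64(42)).unwrap();
    assert!(builder.push_value_ref(ValueRef::Int32(1)).is_err());

    // extend_slice_of bulk-copies a range out of an existing vector.
    let src = Int64Vector::from_slice(&[7, 8, 9]);
    builder.extend_slice_of(&src, 1, 2).unwrap();
    assert_eq!(3, builder.to_vector().len());
}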
-
-impl<T> ScalarVectorBuilder for PrimitiveVectorBuilder<T>
-where
-    T: LogicalPrimitiveType,
-    T::Wrapper: Scalar<VectorType = PrimitiveVector<T>>,
-    for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>,
-    for<'a> T::Wrapper: Scalar<RefType<'a> = T::Wrapper>,
-{
-    type VectorType = PrimitiveVector<T>;
-
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            mutable_array: PrimitiveBuilder::with_capacity(capacity),
-        }
-    }
-
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        self.mutable_array
-            .append_option(value.map(|v| v.into_native()));
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        PrimitiveVector {
-            array: self.mutable_array.finish(),
-        }
-    }
-}
-
-pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
-    vector: &PrimitiveVector<T>,
-    offsets: &[usize],
-) -> PrimitiveVector<T> {
-    assert_eq!(offsets.len(), vector.len());
-
-    if offsets.is_empty() {
-        return vector.get_slice(0, 0);
-    }
-
-    let mut builder = PrimitiveVectorBuilder::<T>::with_capacity(*offsets.last().unwrap() as usize);
-
-    let mut previous_offset = 0;
-
-    for (offset, value) in offsets.iter().zip(vector.array.iter()) {
-        let repeat_times = *offset - previous_offset;
-        match value {
-            Some(data) => {
-                unsafe {
-                    // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen.
-                    builder
-                        .mutable_array
-                        .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times));
-                }
-            }
-            None => {
-                builder.mutable_array.append_nulls(repeat_times);
-            }
-        }
-        previous_offset = *offset;
-    }
-    builder.finish()
-}
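The null arm above matters: a run of nulls cannot go through the value fast path, so it is emitted as `repeat_times` nulls in one call. The same Some/None split, sketched with plain `Option` values (hypothetical helper, not part of the patch):

fn replicate_nullable(values: &[Option<i32>], offsets: &[usize]) -> Vec<Option<i32>> {
    assert_eq!(offsets.len(), values.len());
    let mut out = Vec::with_capacity(*offsets.last().unwrap_or(&0));
    let mut previous = 0;
    for (value, offset) in values.iter().zip(offsets) {
        // Both Some and None runs have length offset - previous.
        out.extend(std::iter::repeat(*value).take(offset - previous));
        previous = *offset;
    }
    out
}

fn main() {
    assert_eq!(
        vec![None, None, Some(1), Some(1)],
        replicate_nullable(&[None, Some(1)], &[2, 4])
    );
}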
-
-#[cfg(test)]
-mod tests {
-    use arrow::array::Int32Array;
-    use arrow::datatypes::DataType as ArrowDataType;
-    use serde_json;
-
-    use super::*;
-    use crate::data_type::DataType;
-    use crate::serialize::Serializable;
-    use crate::types::Int64Type;
-
-    fn check_vec(v: Int32Vector) {
-        assert_eq!(4, v.len());
-        assert_eq!("Int32Vector", v.vector_type_name());
-        assert!(!v.is_const());
-        assert!(v.validity().is_all_valid());
-        assert!(!v.only_null());
-
-        for i in 0..4 {
-            assert!(!v.is_null(i));
-            assert_eq!(Value::Int32(i as i32 + 1), v.get(i));
-            assert_eq!(ValueRef::Int32(i as i32 + 1), v.get_ref(i));
-        }
-
-        let json_value = v.serialize_to_json().unwrap();
-        assert_eq!("[1,2,3,4]", serde_json::to_string(&json_value).unwrap(),);
-
-        let arrow_arr = v.to_arrow_array();
-        assert_eq!(4, arrow_arr.len());
-        assert_eq!(&ArrowDataType::Int32, arrow_arr.data_type());
-    }
-
-    #[test]
-    fn test_from_values() {
-        let v = Int32Vector::from_values(vec![1, 2, 3, 4]);
-        check_vec(v);
-    }
-
-    #[test]
-    fn test_from_vec() {
-        let v = Int32Vector::from_vec(vec![1, 2, 3, 4]);
-        check_vec(v);
-    }
-
-    #[test]
-    fn test_from_slice() {
-        let v = Int32Vector::from_slice(vec![1, 2, 3, 4]);
-        check_vec(v);
-    }
-
-    #[test]
-    fn test_serialize_primitive_vector_with_null_to_json() {
-        let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
-        let mut builder = Int32VectorBuilder::with_capacity(input.len());
-        for v in input {
-            builder.push(v);
-        }
-        let vector = builder.finish();
-
-        let json_value = vector.serialize_to_json().unwrap();
-        assert_eq!(
-            "[1,2,null,4,null]",
-            serde_json::to_string(&json_value).unwrap(),
-        );
-    }
-
-    #[test]
-    fn test_from_arrow_array() {
-        let arrow_array = Int32Array::from(vec![1, 2, 3, 4]);
-        let v = Int32Vector::from(arrow_array);
-        check_vec(v);
-    }
-
-    #[test]
-    fn test_primitive_vector_build_get() {
-        let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
-        let mut builder = Int32VectorBuilder::with_capacity(input.len());
-        for v in input {
-            builder.push(v);
-        }
-        let vector = builder.finish();
-        assert_eq!(input.len(), vector.len());
-
-        for (i, v) in input.into_iter().enumerate() {
-            assert_eq!(v, vector.get_data(i));
-            assert_eq!(Value::from(v), vector.get(i));
-        }
-
-        let res: Vec<_> = vector.iter_data().collect();
-        assert_eq!(input, &res[..]);
-    }
-
-    #[test]
-    fn test_primitive_vector_validity() {
-        let input = [Some(1i32), Some(2i32), None, None];
-        let mut builder = Int32VectorBuilder::with_capacity(input.len());
-        for v in input {
-            builder.push(v);
-        }
-        let vector = builder.finish();
-        assert_eq!(2, vector.null_count());
-        let validity = vector.validity();
-        assert_eq!(2, validity.null_count());
-        assert!(!validity.is_set(2));
-        assert!(!validity.is_set(3));
-
-        let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]);
-        assert_eq!(0, vector.null_count());
-        assert!(vector.validity().is_all_valid());
-    }
-
-    #[test]
-    fn test_memory_size() {
-        let v = Int32Vector::from_slice((0..5).collect::<Vec<i32>>());
-        assert_eq!(64, v.memory_size());
-        let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
-        assert_eq!(128, v.memory_size());
-    }
-
-    #[test]
-    fn test_primitive_vector_builder() {
-        let mut builder = Int64Type::default().create_mutable_vector(3);
-        builder.push_value_ref(ValueRef::Int64(123)).unwrap();
-        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
-
-        let input = Int64Vector::from_slice(&[7, 8, 9]);
-        builder.extend_slice_of(&input, 1, 2).unwrap();
-        assert!(builder
-            .extend_slice_of(&Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9]));
-        assert_eq!(expect, vector);
-    }
-
-    #[test]
-    fn test_from_wrapper_slice() {
-        macro_rules! test_from_wrapper_slice {
-            ($vec: ident, $ty: ident) => {
-                let from_wrapper_slice = $vec::from_wrapper_slice(&[
-                    $ty::from_native($ty::MAX),
-                    $ty::from_native($ty::MIN),
-                ]);
-                let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]);
-                assert_eq!(from_wrapper_slice, from_slice);
-            };
-        }
-
-        test_from_wrapper_slice!(UInt8Vector, u8);
-        test_from_wrapper_slice!(Int8Vector, i8);
-        test_from_wrapper_slice!(UInt16Vector, u16);
-        test_from_wrapper_slice!(Int16Vector, i16);
-        test_from_wrapper_slice!(UInt32Vector, u32);
-        test_from_wrapper_slice!(Int32Vector, i32);
-        test_from_wrapper_slice!(UInt64Vector, u64);
-        test_from_wrapper_slice!(Int64Vector, i64);
-        test_from_wrapper_slice!(Float32Vector, f32);
-        test_from_wrapper_slice!(Float64Vector, f64);
-    }
-}
diff --git a/src/datatypes2/src/vectors/string.rs b/src/datatypes2/src/vectors/string.rs
deleted file mode 100644
index 252116b3b2..0000000000
--- a/src/datatypes2/src/vectors/string.rs
+++ /dev/null
@@ -1,370 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
-use snafu::ResultExt;
-
-use crate::arrow_array::{MutableStringArray, StringArray};
-use crate::data_type::ConcreteDataType;
-use crate::error::{self, Result};
-use crate::scalars::{ScalarVector, ScalarVectorBuilder};
-use crate::serialize::Serializable;
-use crate::value::{Value, ValueRef};
-use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
-
-/// Vector of strings.
-#[derive(Debug, PartialEq)]
-pub struct StringVector {
-    array: StringArray,
-}
-
-impl StringVector {
-    pub(crate) fn as_arrow(&self) -> &dyn Array {
-        &self.array
-    }
-
-    fn to_array_data(&self) -> ArrayData {
-        self.array.data().clone()
-    }
-
-    fn from_array_data(data: ArrayData) -> Self {
-        Self {
-            array: StringArray::from(data),
-        }
-    }
-}
-
-impl From<StringArray> for StringVector {
-    fn from(array: StringArray) -> Self {
-        Self { array }
-    }
-}
-
-impl From<Vec<Option<String>>> for StringVector {
-    fn from(data: Vec<Option<String>>) -> Self {
-        Self {
-            array: StringArray::from_iter(data),
-        }
-    }
-}
-
-impl From<Vec<Option<&str>>> for StringVector {
-    fn from(data: Vec<Option<&str>>) -> Self {
-        Self {
-            array: StringArray::from_iter(data),
-        }
-    }
-}
-
-impl From<&[Option<String>]> for StringVector {
-    fn from(data: &[Option<String>]) -> Self {
-        Self {
-            array: StringArray::from_iter(data),
-        }
-    }
-}
-
-impl From<&[Option<&str>]> for StringVector {
-    fn from(data: &[Option<&str>]) -> Self {
-        Self {
-            array: StringArray::from_iter(data),
-        }
-    }
-}
-
-impl From<Vec<String>> for StringVector {
-    fn from(data: Vec<String>) -> Self {
-        Self {
-            array: StringArray::from_iter(data.into_iter().map(Some)),
-        }
-    }
-}
-
-impl From<Vec<&str>> for StringVector {
-    fn from(data: Vec<&str>) -> Self {
-        Self {
-            array: StringArray::from_iter(data.into_iter().map(Some)),
-        }
-    }
-}
-
-impl Vector for StringVector {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::string_datatype()
-    }
-
-    fn vector_type_name(&self) -> String {
-        "StringVector".to_string()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn len(&self) -> usize {
-        self.array.len()
-    }
-
-    fn to_arrow_array(&self) -> ArrayRef {
-        let data = self.to_array_data();
-        Arc::new(StringArray::from(data))
-    }
-
-    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
-        let data = self.to_array_data();
-        Box::new(StringArray::from(data))
-    }
-
-    fn validity(&self) -> Validity {
-        vectors::impl_validity_for_vector!(self.array)
-    }
-
-    fn memory_size(&self) -> usize {
-        self.array.get_buffer_memory_size()
-    }
-
-    fn null_count(&self) -> usize {
-        self.array.null_count()
-    }
-
-    fn is_null(&self, row: usize) -> bool {
-        self.array.is_null(row)
-    }
-
-    fn slice(&self, offset: usize, length: usize) -> VectorRef {
-        let data = self.array.data().slice(offset, length);
-        Arc::new(Self::from_array_data(data))
-    }
-
-    fn get(&self, index: usize) -> Value {
-        vectors::impl_get_for_vector!(self.array, index)
-    }
-
-    fn get_ref(&self, index: usize) -> ValueRef {
-        vectors::impl_get_ref_for_vector!(self.array, index)
-    }
-}
-
-impl ScalarVector for StringVector {
-    type OwnedItem = String;
-    type RefItem<'a> = &'a str;
-    type Iter<'a> = ArrayIter<&'a StringArray>;
-    type Builder = StringVectorBuilder;
-
-    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
-        if self.array.is_valid(idx) {
-            Some(self.array.value(idx))
-        } else {
-            None
-        }
-    }
-
-    fn iter_data(&self) -> Self::Iter<'_> {
-        self.array.iter()
-    }
-}
-
-pub struct StringVectorBuilder {
-    mutable_array: MutableStringArray,
-}
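Given the conversion impls above, constructing a StringVector is uniform across owned/borrowed and nullable/non-nullable inputs. A small sketch (assuming the crate's prelude as used elsewhere in this patch):

use datatypes::prelude::*;
use datatypes::vectors::StringVector;

fn main() {
    // Non-nullable input is wrapped in Some internally...
    let a = StringVector::from(vec!["a", "b"]);
    // ...so it equals the explicitly nullable construction.
    let b = StringVector::from(vec![Some("a"), Some("b")]);
    assert_eq!(a, b);
    assert_eq!(2, a.len());
}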
-
-impl MutableVector for StringVectorBuilder {
-    fn data_type(&self) -> ConcreteDataType {
-        ConcreteDataType::string_datatype()
-    }
-
-    fn len(&self) -> usize {
-        self.mutable_array.len()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn to_vector(&mut self) -> VectorRef {
-        Arc::new(self.finish())
-    }
-
-    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
-        match value.as_string()? {
-            Some(v) => self.mutable_array.append_value(v),
-            None => self.mutable_array.append_null(),
-        }
-        Ok(())
-    }
-
-    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
-        vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
-    }
-}
-
-impl ScalarVectorBuilder for StringVectorBuilder {
-    type VectorType = StringVector;
-
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            mutable_array: MutableStringArray::with_capacity(capacity, 0),
-        }
-    }
-
-    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
-        match value {
-            Some(v) => self.mutable_array.append_value(v),
-            None => self.mutable_array.append_null(),
-        }
-    }
-
-    fn finish(&mut self) -> Self::VectorType {
-        StringVector {
-            array: self.mutable_array.finish(),
-        }
-    }
-}
-
-impl Serializable for StringVector {
-    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
-        self.iter_data()
-            .map(serde_json::to_value)
-            .collect::<serde_json::Result<Vec<_>>>()
-            .context(error::SerializeSnafu)
-    }
-}
-
-vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector);
-
-#[cfg(test)]
-mod tests {
-    use arrow::datatypes::DataType;
-
-    use super::*;
-
-    #[test]
-    fn test_string_vector_build_get() {
-        let mut builder = StringVectorBuilder::with_capacity(4);
-        builder.push(Some("hello"));
-        builder.push(None);
-        builder.push(Some("world"));
-        let vector = builder.finish();
-
-        assert_eq!(Some("hello"), vector.get_data(0));
-        assert_eq!(None, vector.get_data(1));
-        assert_eq!(Some("world"), vector.get_data(2));
-
-        // Get out of bound
-        assert!(vector.try_get(3).is_err());
-
-        assert_eq!(Value::String("hello".into()), vector.get(0));
-        assert_eq!(Value::Null, vector.get(1));
-        assert_eq!(Value::String("world".into()), vector.get(2));
-
-        let mut iter = vector.iter_data();
-        assert_eq!("hello", iter.next().unwrap().unwrap());
-        assert_eq!(None, iter.next().unwrap());
-        assert_eq!("world", iter.next().unwrap().unwrap());
-        assert_eq!(None, iter.next());
-    }
-
-    #[test]
-    fn test_string_vector_builder() {
-        let mut builder = StringVectorBuilder::with_capacity(3);
-        builder.push_value_ref(ValueRef::String("hello")).unwrap();
-        assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
-
-        let input = StringVector::from_slice(&["world", "one", "two"]);
-        builder.extend_slice_of(&input, 1, 2).unwrap();
-        assert!(builder
-            .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
-            .is_err());
-        let vector = builder.to_vector();
-
-        let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
-        assert_eq!(expect, vector);
-    }
-
-    #[test]
-    fn test_string_vector_misc() {
-        let strs = vec!["hello", "greptime", "rust"];
-        let v = StringVector::from(strs.clone());
-        assert_eq!(3, v.len());
-        assert_eq!("StringVector", v.vector_type_name());
-        assert!(!v.is_const());
-        assert!(v.validity().is_all_valid());
-        assert!(!v.only_null());
-        assert_eq!(128, v.memory_size());
-
-        for (i, s) in strs.iter().enumerate() {
-            assert_eq!(Value::from(*s), v.get(i));
-            assert_eq!(ValueRef::from(*s), v.get_ref(i));
-            assert_eq!(Value::from(*s), v.try_get(i).unwrap());
-        }
-
-        let arrow_arr = v.to_arrow_array();
-        assert_eq!(3, arrow_arr.len());
-        assert_eq!(&DataType::Utf8, arrow_arr.data_type());
-    }
-
-    #[test]
-    fn test_serialize_string_vector() {
-        let mut builder = StringVectorBuilder::with_capacity(3);
-        builder.push(Some("hello"));
-        builder.push(None);
-        builder.push(Some("world"));
-        let string_vector = builder.finish();
-        let serialized =
-            serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
-        assert_eq!(r#"["hello",null,"world"]"#, serialized);
-    }
-
-    #[test]
-    fn test_from_arrow_array() {
-        let mut builder = MutableStringArray::new();
-        builder.append_option(Some("A"));
-        builder.append_option(Some("B"));
-        builder.append_null();
-        builder.append_option(Some("D"));
-        let string_array: StringArray = builder.finish();
-        let vector = StringVector::from(string_array);
-        assert_eq!(
-            r#"["A","B",null,"D"]"#,
-            serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
-        );
-    }
-
-    #[test]
-    fn test_from_non_option_string() {
-        let nul = String::from_utf8(vec![0]).unwrap();
-        let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
-        let vector = StringVector::from(corpus);
-        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
-        assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);
-
-        let corpus = vec![
-            "🀀🀀🀀".to_string(),
-            "🀁🀁🀁".to_string(),
-            "🀂🀂🀂".to_string(),
-            "🀃🀃🀃".to_string(),
-            "🀆🀆".to_string(),
-        ];
-        let vector = StringVector::from(corpus);
-        let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
-        assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
-    }
-}
diff --git a/src/datatypes2/src/vectors/timestamp.rs b/src/datatypes2/src/vectors/timestamp.rs
deleted file mode 100644
index 5d9f7f2ed1..0000000000
--- a/src/datatypes2/src/vectors/timestamp.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2022 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use crate::types::{
-    TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
-    TimestampSecondType,
-};
-use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
-
-pub type TimestampSecondVector = PrimitiveVector<TimestampSecondType>;
-pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder<TimestampSecondType>;
-
-pub type TimestampMillisecondVector = PrimitiveVector<TimestampMillisecondType>;
-pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder<TimestampMillisecondType>;
-
-pub type TimestampMicrosecondVector = PrimitiveVector<TimestampMicrosecondType>;
-pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder<TimestampMicrosecondType>;
-
-pub type TimestampNanosecondVector = PrimitiveVector<TimestampNanosecondType>;
-pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder<TimestampNanosecondType>;
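Each alias pins `PrimitiveVector` to one timestamp unit, so unit mistakes become type errors instead of silent data corruption. A usage sketch (assuming these aliases are re-exported from `datatypes::vectors`, as the tests later in this patch do):

use datatypes::prelude::*;
use datatypes::vectors::TimestampMillisecondVector;

fn main() {
    // Native values are i64 milliseconds; the unit lives in the type.
    let v = TimestampMillisecondVector::from_vec(vec![1000i64, 2000]);
    assert_eq!(2, v.len());
    // get() yields a millisecond Value::Timestamp, not a bare i64.
    println!("{:?}", v.get(0));
}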
diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml
index 56065fe1c0..90c7120671 100644
--- a/src/frontend/Cargo.toml
+++ b/src/frontend/Cargo.toml
@@ -22,11 +22,9 @@ common-recordbatch = { path = "../common/recordbatch" }
 common-runtime = { path = "../common/runtime" }
 common-telemetry = { path = "../common/telemetry" }
 common-time = { path = "../common/time" }
-datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [
-    "simd",
-] }
-datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
-datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion = "14.0.0"
+datafusion-common = "14.0.0"
+datafusion-expr = "14.0.0"
 datanode = { path = "../datanode" }
 datatypes = { path = "../datatypes" }
 futures = "0.3"
@@ -44,7 +42,6 @@ servers = { path = "../servers" }
 session = { path = "../session" }
 snafu = { version = "0.7", features = ["backtraces"] }
 sql = { path = "../sql" }
-sqlparser = "0.15"
 store-api = { path = "../store-api" }
 substrait = { path = "../common/substrait" }
 table = { path = "../table" }
diff --git a/src/frontend/src/error.rs b/src/frontend/src/error.rs
index eae56a12f8..2f40eec4b2 100644
--- a/src/frontend/src/error.rs
+++ b/src/frontend/src/error.rs
@@ -17,6 +17,7 @@ use std::any::Any;
 use common_error::prelude::*;
 use common_query::logical_plan::Expr;
 use datafusion_common::ScalarValue;
+use datatypes::prelude::Value;
 use store_api::storage::RegionId;
 
 #[derive(Debug, Snafu)]
@@ -437,6 +438,17 @@ pub enum Error {
         source: substrait::error::Error,
     },
 
+    #[snafu(display(
+        "Failed to build a vector from values, value: {}, source: {}",
+        value,
+        source
+    ))]
+    BuildVector {
+        value: Value,
+        #[snafu(backtrace)]
+        source: datatypes::error::Error,
+    },
+
     #[snafu(display("Failed to invoke GRPC server, source: {}", source))]
     InvokeGrpcServer {
         #[snafu(backtrace)]
@@ -533,6 +545,7 @@ impl ErrorExt for Error {
             Error::LeaderNotFound { .. } => StatusCode::StorageUnavailable,
             Error::TableAlreadyExist { .. } => StatusCode::TableAlreadyExists,
             Error::EncodeSubstraitLogicalPlan { source } => source.status_code(),
+            Error::BuildVector { source, .. } => source.status_code(),
         }
     }
diff --git a/src/frontend/src/expr_factory.rs b/src/frontend/src/expr_factory.rs
index 9f406ace0b..204eb42d92 100644
--- a/src/frontend/src/expr_factory.rs
+++ b/src/frontend/src/expr_factory.rs
@@ -19,9 +19,9 @@ use api::helper::ColumnDataTypeWrapper;
 use api::v1::{Column, ColumnDataType, CreateExpr};
 use datatypes::schema::ColumnSchema;
 use snafu::{ensure, ResultExt};
+use sql::ast::{ColumnDef, TableConstraint};
 use sql::statements::create::{CreateTable, TIME_INDEX};
 use sql::statements::{column_def_to_schema, table_idents_to_full_name};
-use sqlparser::ast::{ColumnDef, TableConstraint};
 
 use crate::error::{
     BuildCreateExprOnInsertionSnafu, ColumnDataTypeSnafu, ConvertColumnDefaultConstraintSnafu,
diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index 64b2bac22a..730c16d3b4 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -695,22 +695,26 @@ mod tests {
             .await
             .unwrap();
         match output {
-            Output::Stream(stream) => {
-                let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
-                let pretty_print = recordbatches.pretty_print();
-                let pretty_print = pretty_print.lines().collect::<Vec<&str>>();
-                let expected = vec![
-                    "+----------------+---------------------+-----+--------+-----------+",
-                    "| host           | ts                  | cpu | memory | disk_util |",
-                    "+----------------+---------------------+-----+--------+-----------+",
-                    "| frontend.host1 | 1970-01-01 00:00:01 | 1.1 | 100    | 9.9       |",
-                    "| frontend.host2 | 1970-01-01 00:00:02 |     |        | 9.9       |",
-                    "| frontend.host3 | 1970-01-01 00:00:03 | 3.3 | 300    | 9.9       |",
-                    "+----------------+---------------------+-----+--------+-----------+",
-                ];
+            Output::RecordBatches(_) => {
+                unreachable!("Output::RecordBatches");
+            }
+            Output::AffectedRows(_) => {
+                unreachable!("Output::AffectedRows");
+            }
+            Output::Stream(s) => {
+                let batches = common_recordbatch::util::collect_batches(s).await.unwrap();
+                let pretty_print = batches.pretty_print().unwrap();
+                let expected = "\
++----------------+---------------------+-----+--------+-----------+
+| host           | ts                  | cpu | memory | disk_util |
++----------------+---------------------+-----+--------+-----------+
+| frontend.host1 | 1970-01-01T00:00:01 | 1.1 | 100    | 9.9       |
+| frontend.host2 | 1970-01-01T00:00:02 |     |        | 9.9       |
+| frontend.host3 | 1970-01-01T00:00:03 | 3.3 | 300    | 9.9       |
++----------------+---------------------+-----+--------+-----------+\
+                ";
                 assert_eq!(pretty_print, expected);
             }
-            _ => unreachable!(),
         };
 
         let sql = "select * from demo where ts>cast(1000000000 as timestamp)"; // use nanoseconds as where condition
             .await
             .unwrap();
         match output {
-            Output::Stream(stream) => {
-                let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
-                let pretty_print = recordbatches.pretty_print();
-                let pretty_print = pretty_print.lines().collect::<Vec<&str>>();
-                let expected = vec![
-                    "+----------------+---------------------+-----+--------+-----------+",
-                    "| host           | ts                  | cpu | memory | disk_util |",
-                    "+----------------+---------------------+-----+--------+-----------+",
-                    "| frontend.host2 | 1970-01-01 00:00:02 |     |        | 9.9       |",
-                    "| frontend.host3 | 1970-01-01 00:00:03 | 3.3 | 300    | 9.9       |",
-                    "+----------------+---------------------+-----+--------+-----------+",
-                ];
-                assert_eq!(pretty_print, expected);
+            Output::RecordBatches(_) => {
+                unreachable!("Output::RecordBatches")
+            }
+            Output::AffectedRows(_) => {
+                unreachable!("Output::AffectedRows")
+            }
+            Output::Stream(s) => {
+                let recordbatches = common_recordbatch::util::collect_batches(s).await.unwrap();
+                let pretty = recordbatches.pretty_print().unwrap();
+                let expected = "\
++----------------+---------------------+-----+--------+-----------+
+| host           | ts                  | cpu | memory | disk_util |
++----------------+---------------------+-----+--------+-----------+
+| frontend.host2 | 1970-01-01T00:00:02 |     |        | 9.9       |
+| frontend.host3 | 1970-01-01T00:00:03 | 3.3 | 300    | 9.9       |
++----------------+---------------------+-----+--------+-----------+\
+                "
+                .to_string();
+                assert_eq!(pretty, expected);
             }
-            _ => unreachable!(),
         };
     }
 
@@ -787,11 +796,11 @@ mod tests {
         let expected_ts_col = Column {
             column_name: "ts".to_string(),
             values: Some(column::Values {
-                ts_millis_values: vec![1000, 2000, 3000, 4000],
+                ts_millisecond_values: vec![1000, 2000, 3000, 4000],
                 ..Default::default()
             }),
             semantic_type: SemanticType::Timestamp as i32,
-            datatype: ColumnDataType::Timestamp as i32,
+            datatype: ColumnDataType::TimestampMillisecond as i32,
             ..Default::default()
         };
 
@@ -909,7 +918,7 @@ mod tests {
             },
             GrpcColumnDef {
                 name: "ts".to_string(),
-                datatype: ColumnDataType::Timestamp as i32,
+                datatype: ColumnDataType::TimestampMillisecond as i32,
                 is_nullable: true,
                 default_constraint: None,
             },
diff --git a/src/frontend/src/instance/distributed.rs b/src/frontend/src/instance/distributed.rs
index 2613654f8f..a44e4596fa 100644
--- a/src/frontend/src/instance/distributed.rs
+++ b/src/frontend/src/instance/distributed.rs
@@ -43,10 +43,10 @@ use servers::error as server_error;
 use servers::query_handler::{GrpcAdminHandler, GrpcQueryHandler, SqlQueryHandler};
 use session::context::QueryContextRef;
 use snafu::{ensure, OptionExt, ResultExt};
+use sql::ast::Value as SqlValue;
 use sql::statements::create::Partitions;
 use sql::statements::sql_value_to_value;
 use sql::statements::statement::Statement;
-use sqlparser::ast::Value as SqlValue;
 use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType};
 
 use crate::catalog::FrontendCatalogManager;
@@ -522,11 +522,12 @@ fn find_partition_columns(
 
 #[cfg(test)]
 mod test {
+    use itertools::Itertools;
     use servers::query_handler::SqlQueryHandlerRef;
     use session::context::QueryContext;
+    use sql::dialect::GenericDialect;
     use sql::parser::ParserContext;
     use sql::statements::statement::Statement;
-    use sqlparser::dialect::GenericDialect;
 
     use super::*;
     use crate::expr_factory::{CreateExprFactory, DefaultCreateExprFactory};
@@ -604,7 +605,9 @@ ENGINE=mito",
                     "| public              |",
                     "| test_show_databases |",
                     "+---------------------+",
-                ];
+                ]
+                .into_iter()
+                .join("\n");
                 let expected2 = vec![
                     "+---------------------+",
                     "| Schemas             |",
                     "+---------------------+",
                     "| test_show_databases |",
                     "| public              |",
                     "+---------------------+",
-                ];
-                let pretty = r.pretty_print();
-                let lines = pretty.lines().collect::<Vec<&str>>();
+                ]
+                .into_iter()
+                .join("\n");
+                let lines = r.pretty_print().unwrap();
                 assert!(lines == expected1 || lines == expected2)
             }
             _ => unreachable!(),
@@ -654,14 +658,12 @@ ENGINE=mito",
         let output = instance.do_query(sql, QueryContext::arc()).await.unwrap();
         match output {
             Output::RecordBatches(r) => {
-                let expected = vec![
-                    "+--------------+",
-                    "| Tables       |",
-                    "+--------------+",
-                    "| dist_numbers |",
-                    "+--------------+",
-                ];
-                assert_eq!(r.pretty_print().lines().collect::<Vec<&str>>(), expected);
+                let expected = r#"+--------------+
+| Tables       |
++--------------+
+| dist_numbers |
++--------------+"#;
+                assert_eq!(r.pretty_print().unwrap(), expected);
             }
             _ => unreachable!(),
         }
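The test updates above all follow one pattern: `pretty_print()` now returns `Result<String>`, so expected tables are compared as a single newline-joined string instead of a `Vec<&str>` of lines. A minimal sketch of the itertools idiom the tests use:

use itertools::Itertools;

fn main() {
    let expected = [
        "+----+",
        "| id |",
        "+----+",
    ]
    .into_iter()
    .join("\n");
    assert_eq!("+----+\n| id |\n+----+", expected);
}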
diff --git a/src/frontend/src/instance/opentsdb.rs b/src/frontend/src/instance/opentsdb.rs
index 842a45240e..9bcec20bb7 100644
--- a/src/frontend/src/instance/opentsdb.rs
+++ b/src/frontend/src/instance/opentsdb.rs
@@ -63,7 +63,7 @@ mod tests {
 
     use common_query::Output;
     use common_recordbatch::RecordBatches;
-    use datafusion::arrow_print;
+    use itertools::Itertools;
     use servers::query_handler::SqlQueryHandler;
     use session::context::QueryContext;
 
@@ -134,22 +134,18 @@ mod tests {
         match output {
             Output::Stream(stream) => {
                 let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
-                let recordbatches = recordbatches
-                    .take()
-                    .into_iter()
-                    .map(|r| r.df_recordbatch)
-                    .collect::<Vec<_>>();
-                let pretty_print = arrow_print::write(&recordbatches);
-                let pretty_print = pretty_print.lines().collect::<Vec<&str>>();
+                let pretty_print = recordbatches.pretty_print().unwrap();
                 let expected = vec![
                     "+---------------------+----------------+-------+-------+-------+",
                     "| greptime_timestamp  | greptime_value | tagk1 | tagk2 | tagk3 |",
                     "+---------------------+----------------+-------+-------+-------+",
-                    "| 1970-01-01 00:00:01 | 1              | tagv1 | tagv2 |       |",
-                    "| 1970-01-01 00:00:02 | 2              |       | tagv2 | tagv3 |",
-                    "| 1970-01-01 00:00:03 | 3              |       |       |       |",
+                    "| 1970-01-01T00:00:01 | 1              | tagv1 | tagv2 |       |",
+                    "| 1970-01-01T00:00:02 | 2              |       | tagv2 | tagv3 |",
+                    "| 1970-01-01T00:00:03 | 3              |       |       |       |",
                     "+---------------------+----------------+-------+-------+-------+",
-                ];
+                ]
+                .into_iter()
+                .join("\n");
                 assert_eq!(pretty_print, expected);
             }
             _ => unreachable!(),
diff --git a/src/frontend/src/mysql.rs b/src/frontend/src/mysql.rs
index a0f8ef7961..87888b147b 100644
--- a/src/frontend/src/mysql.rs
+++ b/src/frontend/src/mysql.rs
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::sync::Arc;
-
 use serde::{Deserialize, Serialize};
 use servers::tls::TlsOption;
 
@@ -22,7 +20,7 @@ pub struct MysqlOptions {
     pub addr: String,
     pub runtime_size: usize,
     #[serde(default = "Default::default")]
-    pub tls: Arc<TlsOption>,
+    pub tls: TlsOption,
 }
 
 impl Default for MysqlOptions {
@@ -30,7 +28,7 @@ impl Default for MysqlOptions {
         Self {
             addr: "127.0.0.1:4002".to_string(),
             runtime_size: 2,
-            tls: Arc::new(TlsOption::default()),
+            tls: TlsOption::default(),
         }
     }
 }
diff --git a/src/frontend/src/postgres.rs b/src/frontend/src/postgres.rs
index c2df2f54dc..144758f315 100644
--- a/src/frontend/src/postgres.rs
+++ b/src/frontend/src/postgres.rs
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::sync::Arc;
-
 use serde::{Deserialize, Serialize};
 use servers::tls::TlsOption;
 
@@ -22,7 +20,7 @@ pub struct PostgresOptions {
     pub addr: String,
     pub runtime_size: usize,
     #[serde(default = "Default::default")]
-    pub tls: Arc<TlsOption>,
+    pub tls: TlsOption,
 }
 
 impl Default for PostgresOptions {
diff --git a/src/frontend/src/spliter.rs b/src/frontend/src/spliter.rs
index eb87907651..f70116b69e 100644
--- a/src/frontend/src/spliter.rs
+++ b/src/frontend/src/spliter.rs
@@ -14,8 +14,10 @@
 
 use std::collections::HashMap;
 
+use datatypes::data_type::DataType;
+use datatypes::prelude::MutableVector;
 use datatypes::value::Value;
-use datatypes::vectors::{VectorBuilder, VectorRef};
+use datatypes::vectors::VectorRef;
 use snafu::{ensure, OptionExt};
 use store_api::storage::RegionNumber;
 use table::requests::InsertRequest;
@@ -125,9 +127,16 @@ fn partition_insert_request(
     insert: &InsertRequest,
     region_map: HashMap<RegionNumber, Vec<usize>>,
 ) -> DistInsertRequest {
-    let mut dist_insert: HashMap<RegionNumber, HashMap<&str, VectorBuilder>> =
+    let mut dist_insert: HashMap<RegionNumber, HashMap<&str, Box<dyn MutableVector>>> =
         HashMap::with_capacity(region_map.len());
 
+    let row_num = insert
+        .columns_values
+        .values()
+        .next()
+        .map(|v| v.len())
+        .unwrap_or(0);
+
     let column_count = insert.columns_values.len();
     for (column_name, vector) in &insert.columns_values {
         for (region_id, val_idxs) in &region_map {
@@ -136,10 +145,13 @@ fn partition_insert_request(
                 .or_insert_with(|| HashMap::with_capacity(column_count));
             let builder = region_insert
                 .entry(column_name)
-                .or_insert_with(|| VectorBuilder::new(vector.data_type()));
-            val_idxs
-                .iter()
-                .for_each(|idx| builder.push(&vector.get(*idx)));
+                .or_insert_with(|| vector.data_type().create_mutable_vector(row_num));
+            val_idxs.iter().for_each(|idx| {
+                // Safety: MutableVector is built according to column data type.
+                builder
+                    .push_value_ref(vector.get(*idx).as_value_ref())
+                    .unwrap();
+            });
         }
     }
 
@@ -151,7 +163,7 @@ fn partition_insert_request(
         .map(|(region_id, vector_map)| {
             let columns_values = vector_map
                 .into_iter()
-                .map(|(column_name, mut builder)| (column_name.to_string(), builder.finish()))
+                .map(|(column_name, mut builder)| (column_name.to_string(), builder.to_vector()))
                 .collect();
             (
                 region_id,
@@ -175,9 +187,12 @@ mod tests {
 
     use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
     use datatypes::data_type::ConcreteDataType;
-    use datatypes::types::{BooleanType, StringType};
+    use datatypes::prelude::ScalarVectorBuilder;
+    use datatypes::types::StringType;
     use datatypes::value::Value;
-    use datatypes::vectors::VectorBuilder;
+    use datatypes::vectors::{
+        BooleanVectorBuilder, Int16VectorBuilder, MutableVector, StringVectorBuilder,
+    };
     use serde::{Deserialize, Serialize};
     use store_api::storage::RegionNumber;
     use table::requests::InsertRequest;
@@ -339,17 +354,17 @@ mod tests {
 
     #[test]
     fn test_partition_values() {
-        let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType));
-        builder.push(&true.into());
-        builder.push(&false.into());
-        builder.push(&true.into());
-        let v1 = builder.finish();
+        let mut builder = BooleanVectorBuilder::with_capacity(3);
+        builder.push(Some(true));
+        builder.push(Some(false));
+        builder.push(Some(true));
+        let v1 = builder.to_vector();
 
-        let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType));
-        builder.push(&"host1".into());
-        builder.push_null();
-        builder.push(&"host3".into());
-        let v2 = builder.finish();
+        let mut builder = StringVectorBuilder::with_capacity(3);
+        builder.push(Some("host1"));
+        builder.push(None);
+        builder.push(Some("host3"));
+        let v2 = builder.to_vector();
 
         let vectors = vec![v1, v2];
@@ -368,23 +383,23 @@ mod tests {
     fn mock_insert_request() -> InsertRequest {
         let mut columns_values = HashMap::with_capacity(4);
-        let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType));
-        builder.push(&true.into());
-        builder.push(&false.into());
-        builder.push(&true.into());
-        columns_values.insert("enable_reboot".to_string(), builder.finish());
+        let mut builder = BooleanVectorBuilder::with_capacity(3);
+        builder.push(Some(true));
+        builder.push(Some(false));
+        builder.push(Some(true));
+        columns_values.insert("enable_reboot".to_string(), builder.to_vector());
 
-        let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType));
-        builder.push(&"host1".into());
-        builder.push_null();
-        builder.push(&"host3".into());
-        columns_values.insert("host".to_string(), builder.finish());
+        let mut builder = StringVectorBuilder::with_capacity(3);
+        builder.push(Some("host1"));
+        builder.push(None);
+        builder.push(Some("host3"));
+        columns_values.insert("host".to_string(), builder.to_vector());
 
-        let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype());
-        builder.push(&1_i16.into());
-        builder.push(&2_i16.into());
-        builder.push(&3_i16.into());
-        columns_values.insert("id".to_string(), builder.finish());
+        let mut builder = Int16VectorBuilder::with_capacity(3);
+        builder.push(Some(1_i16));
+        builder.push(Some(2_i16));
+        builder.push(Some(3_i16));
+        columns_values.insert("id".to_string(), builder.to_vector());
 
         InsertRequest {
             catalog_name: DEFAULT_CATALOG_NAME.to_string(),
@@ -396,22 +411,22 @@ mod tests {
     fn mock_wrong_insert_request() -> InsertRequest {
         let mut columns_values = HashMap::with_capacity(4);
-        let mut builder = VectorBuilder::new(ConcreteDataType::Boolean(BooleanType));
-        builder.push(&true.into());
-        builder.push(&false.into());
-        builder.push(&true.into());
-        columns_values.insert("enable_reboot".to_string(), builder.finish());
+        let mut builder = BooleanVectorBuilder::with_capacity(3);
+        builder.push(Some(true));
+        builder.push(Some(false));
+        builder.push(Some(true));
+        columns_values.insert("enable_reboot".to_string(), builder.to_vector());
 
-        let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType));
-        builder.push(&"host1".into());
-        builder.push_null();
-        builder.push(&"host3".into());
-        columns_values.insert("host".to_string(), builder.finish());
+        let mut builder = StringVectorBuilder::with_capacity(3);
+        builder.push(Some("host1"));
+        builder.push(None);
+        builder.push(Some("host3"));
+        columns_values.insert("host".to_string(), builder.to_vector());
 
-        let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype());
-        builder.push(&1_i16.into());
+        let mut builder = Int16VectorBuilder::with_capacity(1);
+        builder.push(Some(1_i16));
         // two values are missing
-        columns_values.insert("id".to_string(), builder.finish());
+        columns_values.insert("id".to_string(), builder.to_vector());
 
         InsertRequest {
             catalog_name: DEFAULT_CATALOG_NAME.to_string(),
diff --git a/src/frontend/src/sql.rs b/src/frontend/src/sql.rs
index f888d5e836..8814ef2bf5 100644
--- a/src/frontend/src/sql.rs
+++ b/src/frontend/src/sql.rs
@@ -14,15 +14,15 @@
 use catalog::SchemaProviderRef;
 use common_error::snafu::ensure;
-use datatypes::prelude::ConcreteDataType;
-use datatypes::vectors::VectorBuilder;
+use datatypes::data_type::DataType;
+use datatypes::prelude::{ConcreteDataType, MutableVector};
 use snafu::{OptionExt, ResultExt};
 use sql::ast::Value as SqlValue;
 use sql::statements;
 use sql::statements::insert::Insert;
 use table::requests::InsertRequest;
 
-use crate::error::{self, Result};
+use crate::error::{self, BuildVectorSnafu, Result};
 
 // TODO(fys): Extract the common logic in datanode and frontend in the future.
 #[allow(dead_code)]
@@ -49,7 +49,7 @@ pub(crate) fn insert_to_request(
     };
 
     let rows_num = values.len();
-    let mut columns_builders: Vec<(&String, &ConcreteDataType, VectorBuilder)> =
+    let mut columns_builders: Vec<(&String, &ConcreteDataType, Box<dyn MutableVector>)> =
         Vec::with_capacity(columns_num);
 
     if columns.is_empty() {
@@ -58,7 +58,7 @@ pub(crate) fn insert_to_request(
             columns_builders.push((
                 &column_schema.name,
                 data_type,
-                VectorBuilder::with_capacity(data_type.clone(), rows_num),
+                data_type.create_mutable_vector(rows_num),
             ));
         }
     } else {
@@ -73,7 +73,7 @@ pub(crate) fn insert_to_request(
             columns_builders.push((
                 column_name,
                 data_type,
-                VectorBuilder::with_capacity(data_type.clone(), rows_num),
+                data_type.create_mutable_vector(rows_num),
             ));
         }
     }
@@ -100,7 +100,7 @@ pub(crate) fn insert_to_request(
         table_name,
         columns_values: columns_builders
             .into_iter()
-            .map(|(c, _, mut b)| (c.to_owned(), b.finish()))
+            .map(|(c, _, mut b)| (c.to_owned(), b.to_vector()))
             .collect(),
     })
 }
 
@@ -109,11 +109,12 @@ fn add_row_to_vector(
     column_name: &str,
     data_type: &ConcreteDataType,
     sql_val: &SqlValue,
-    builder: &mut VectorBuilder,
+    builder: &mut Box<dyn MutableVector>,
 ) -> Result<()> {
     let value = statements::sql_value_to_value(column_name, data_type, sql_val)
         .context(error::ParseSqlSnafu)?;
-    builder.push(&value);
-
+    builder
+        .push_value_ref(value.as_value_ref())
+        .context(BuildVectorSnafu { value })?;
     Ok(())
 }
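The `VectorBuilder`-to-`MutableVector` move above generalizes to any column type: the builder is created from the column's `ConcreteDataType` as a trait object and fed `ValueRef`s. A condensed sketch of that flow (hypothetical `build_column` helper; `unwrap` stands in for the `BuildVectorSnafu` context the real code attaches):

use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::*;
use datatypes::vectors::VectorRef;

fn build_column(values: &[Value], data_type: &ConcreteDataType) -> VectorRef {
    let mut builder = data_type.create_mutable_vector(values.len());
    for value in values {
        // Fails on a type mismatch; insert_to_request wraps this in BuildVectorSnafu.
        builder.push_value_ref(value.as_value_ref()).unwrap();
    }
    builder.to_vector()
}

fn main() {
    let col = build_column(
        &[Value::Int32(1), Value::Null],
        &ConcreteDataType::int32_datatype(),
    );
    assert_eq!(2, col.len());
}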
diff --git a/src/frontend/src/table.rs b/src/frontend/src/table.rs
index ac97d2dc3c..2d157f30a8 100644
--- a/src/frontend/src/table.rs
+++ b/src/frontend/src/table.rs
@@ -29,12 +29,13 @@ use common_query::physical_plan::{PhysicalPlan, PhysicalPlanRef};
 use common_recordbatch::adapter::AsyncRecordBatchStreamAdapter;
 use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
 use common_telemetry::debug;
-use datafusion::execution::runtime_env::RuntimeEnv;
-use datafusion::logical_plan::Expr as DfExpr;
+use datafusion::execution::context::TaskContext;
 use datafusion::physical_plan::{
     Partitioning, SendableRecordBatchStream as DfSendableRecordBatchStream,
 };
 use datafusion_common::DataFusionError;
+use datafusion_expr::expr::Expr as DfExpr;
+use datafusion_expr::BinaryExpr;
 use datatypes::prelude::Value;
 use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
 use meta_client::rpc::{Peer, TableName};
@@ -198,7 +199,7 @@ impl DistTable {
     ) -> Result<Vec<RegionNumber>> {
         let expr = filter.df_expr();
         match expr {
-            DfExpr::BinaryExpr { left, op, right } if is_compare_op(op) => {
+            DfExpr::BinaryExpr(BinaryExpr { left, op, right }) if is_compare_op(op) => {
                 let column_op_value = match (left.as_ref(), right.as_ref()) {
                     (DfExpr::Column(c), DfExpr::Literal(v)) => Some((&c.name, *op, v)),
                     (DfExpr::Literal(v), DfExpr::Column(c)) => {
@@ -217,7 +218,7 @@ impl DistTable {
                         .collect::<Vec<RegionNumber>>());
                 }
             }
-            DfExpr::BinaryExpr { left, op, right }
+            DfExpr::BinaryExpr(BinaryExpr { left, op, right })
                 if matches!(op, Operator::And | Operator::Or) =>
             {
                 let left_regions =
@@ -449,7 +450,7 @@ impl PhysicalPlan for DistTableScan {
     fn execute(
         &self,
         partition: usize,
-        _runtime: Arc<RuntimeEnv>,
+        _context: Arc<TaskContext>,
     ) -> QueryResult<SendableRecordBatchStream> {
         let exec = self.partition_execs[partition].clone();
         let stream = Box::pin(async move {
@@ -515,18 +516,20 @@ mod test {
 
     use datafusion::physical_plan::expressions::{col as physical_col, PhysicalSortExpr};
     use datafusion::physical_plan::sorts::sort::SortExec;
     use datafusion::physical_plan::ExecutionPlan;
+    use datafusion::prelude::SessionContext;
+    use datafusion::sql::sqlparser;
     use datafusion_expr::expr_fn::{and, binary_expr, col, or};
     use datafusion_expr::lit;
     use datanode::instance::Instance;
-    use datatypes::arrow::compute::sort::SortOptions;
+    use datatypes::arrow::compute::SortOptions;
     use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{ColumnSchema, Schema};
+    use itertools::Itertools;
    use meta_client::client::MetaClient;
     use meta_client::rpc::router::RegionRoute;
     use meta_client::rpc::{Region, Table, TableRoute};
     use sql::parser::ParserContext;
     use sql::statements::statement::Statement;
-    use sqlparser::dialect::GenericDialect;
     use table::metadata::{TableInfoBuilder, TableMetaBuilder};
     use table::TableRef;
 
@@ -733,7 +736,6 @@ mod test {
     #[tokio::test(flavor = "multi_thread")]
     async fn test_dist_table_scan() {
         let table = Arc::new(new_dist_table().await);
-
         // should scan all regions
         // select a, row_id from numbers
         let projection = Some(vec![1, 2]);
@@ -859,6 +861,7 @@ mod test {
         expected_partitions: usize,
         expected_output: Vec<&str>,
     ) {
+        let expected_output = expected_output.into_iter().join("\n");
         let table_scan = table
             .scan(&projection, filters.as_slice(), None)
             .await
@@ -877,21 +880,17 @@ mod test {
                 options: SortOptions::default(),
             }],
             Arc::new(merge),
+            None,
         )
         .unwrap();
         assert_eq!(sort.output_partitioning().partition_count(), 1);
 
-        let stream = sort
-            .execute(0, Arc::new(RuntimeEnv::default()))
-            .await
-            .unwrap();
+        let session_ctx = SessionContext::new();
+        let stream = sort.execute(0, session_ctx.task_ctx()).unwrap();
         let stream = Box::pin(RecordBatchStreamAdapter::try_new(stream).unwrap());
         let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
-        assert_eq!(
-            recordbatches.pretty_print().lines().collect::<Vec<&str>>(),
-            expected_output
-        );
+        assert_eq!(recordbatches.pretty_print().unwrap(), expected_output);
     }
 
     async fn new_dist_table() -> DistTable {
@@ -923,14 +922,16 @@ mod test {
             PARTITION r3 VALUES LESS THAN (MAXVALUE),
         )
         ENGINE=mito";
-        let create_table = match ParserContext::create_with_dialect(sql, &GenericDialect {})
-            .unwrap()
-            .pop()
-            .unwrap()
-        {
-            Statement::CreateTable(c) => c,
-            _ => unreachable!(),
-        };
+
+        let create_table =
+            match ParserContext::create_with_dialect(sql, &sqlparser::dialect::GenericDialect {})
+                .unwrap()
+                .pop()
+                .unwrap()
+            {
+                Statement::CreateTable(c) => c,
+                _ => unreachable!(),
+            };
 
         let mut expr = DefaultCreateExprFactory
             .create_expr_by_stmt(&create_table)
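The repeated `DfExpr::BinaryExpr(BinaryExpr { .. })` rewrites above come from datafusion 14 turning the old struct-like enum variant into a tuple variant wrapping `datafusion_expr::BinaryExpr`. A sketch of matching under the new shape:

use datafusion_expr::{BinaryExpr, Expr, Operator};

// Returns true for `<col> = <lit>`-style comparison filters.
fn is_eq_filter(expr: &Expr) -> bool {
    matches!(
        expr,
        Expr::BinaryExpr(BinaryExpr {
            op: Operator::Eq,
            ..
        })
    )
}

fn main() {
    use datafusion_expr::{col, lit};
    assert!(is_eq_filter(&col("host").eq(lit("host1"))));
}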
diff --git a/src/frontend/src/table/insert.rs b/src/frontend/src/table/insert.rs
index 409632474f..fb23b0e792 100644
--- a/src/frontend/src/table/insert.rs
+++ b/src/frontend/src/table/insert.rs
@@ -107,7 +107,7 @@ pub fn insert_request_to_insert_batch(insert: &InsertRequest) -> Result<(Vec<Column>, u32)> {
 fn mock_insert_request() -> InsertRequest {
     let mut columns_values = HashMap::with_capacity(4);
-    let mut builder = VectorBuilder::new(ConcreteDataType::String(StringType));
-    builder.push(&"host1".into());
-    builder.push_null();
-    builder.push(&"host3".into());
-    columns_values.insert("host".to_string(), builder.finish());
+    let mut builder = StringVectorBuilder::with_capacity(3);
+    builder.push(Some("host1"));
+    builder.push(None);
+    builder.push(Some("host3"));
+    columns_values.insert("host".to_string(), builder.to_vector());
 
-    let mut builder = VectorBuilder::new(ConcreteDataType::int16_datatype());
-    builder.push(&1_i16.into());
-    builder.push(&2_i16.into());
-    builder.push(&3_i16.into());
-    columns_values.insert("id".to_string(), builder.finish());
+    let mut builder = Int16VectorBuilder::with_capacity(3);
+    builder.push(Some(1_i16));
+    builder.push(Some(2_i16));
+    builder.push(Some(3_i16));
+    columns_values.insert("id".to_string(), builder.to_vector());
 
     InsertRequest {
         catalog_name: DEFAULT_CATALOG_NAME.to_string(),
diff --git a/src/frontend/src/table/scan.rs b/src/frontend/src/table/scan.rs
index 14ea9a6a93..3d9f623aeb 100644
--- a/src/frontend/src/table/scan.rs
+++ b/src/frontend/src/table/scan.rs
@@ -20,7 +20,8 @@ use client::{Database, ObjectResult};
 use common_query::prelude::Expr;
 use common_query::Output;
 use common_recordbatch::{util, RecordBatches};
-use datafusion::logical_plan::{LogicalPlan, LogicalPlanBuilder};
+use datafusion::datasource::DefaultTableSource;
+use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};
 use meta_client::rpc::TableName;
 use snafu::ResultExt;
 use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
@@ -82,7 +83,7 @@ impl DatanodeInstance {
 
         let mut builder = LogicalPlanBuilder::scan_with_filters(
             &table_scan.table_name.to_string(),
-            table_provider,
+            Arc::new(DefaultTableSource::new(table_provider)),
             table_scan.projection.clone(),
             table_scan
                 .filters
@@ -104,11 +105,9 @@ impl DatanodeInstance {
                 .context(error::BuildDfLogicalPlanSnafu)?;
         }
 
-        if let Some(limit) = table_scan.limit {
-            builder = builder
-                .limit(limit)
-                .context(error::BuildDfLogicalPlanSnafu)?;
-        }
+        builder
+            .limit(0, table_scan.limit)
+            .context(error::BuildDfLogicalPlanSnafu)?;
 
         builder.build().context(error::BuildDfLogicalPlanSnafu)
     }
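The limit change reflects datafusion 14's builder signature, `limit(skip, fetch)`, where `fetch: Option<usize>` folds the old `if let Some(limit)` branch into a single call. A sketch under that assumption:

use datafusion_common::Result;
use datafusion_expr::{LogicalPlan, LogicalPlanBuilder};

// skip = 0 keeps all leading rows; fetch = None means "no limit",
// so the builder call no longer needs to be conditional.
fn apply_limit(builder: LogicalPlanBuilder, limit: Option<usize>) -> Result<LogicalPlan> {
    builder.limit(0, limit)?.build()
}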
diff --git a/src/mito/Cargo.toml b/src/mito/Cargo.toml
index 63612075f7..583ff9f3f5 100644
--- a/src/mito/Cargo.toml
+++ b/src/mito/Cargo.toml
@@ -19,10 +19,8 @@ common-query = { path = "../common/query" }
 common-recordbatch = { path = "../common/recordbatch" }
 common-telemetry = { path = "../common/telemetry" }
 common-time = { path = "../common/time" }
-datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [
-    "simd",
-] }
-datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion = "14.0.0"
+datafusion-common = "14.0.0"
 datatypes = { path = "../datatypes" }
 futures = "0.3"
 log-store = { path = "../log-store" }
diff --git a/src/mito/src/engine.rs b/src/mito/src/engine.rs
index 48d3344821..480dd16cea 100644
--- a/src/mito/src/engine.rs
+++ b/src/mito/src/engine.rs
@@ -519,13 +519,14 @@ impl MitoEngineInner {
 
 #[cfg(test)]
 mod tests {
-    use common_query::physical_plan::RuntimeEnv;
+    use common_query::physical_plan::SessionContext;
     use common_recordbatch::util;
-    use datafusion_common::field_util::{FieldExt, SchemaExt};
-    use datatypes::prelude::{ConcreteDataType, ScalarVector};
+    use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, SchemaBuilder};
     use datatypes::value::Value;
-    use datatypes::vectors::*;
+    use datatypes::vectors::{
+        Float64Vector, Int32Vector, StringVector, TimestampMillisecondVector, VectorRef,
+    };
     use log_store::fs::noop::NoopLogStore;
     use storage::config::EngineConfig as StorageEngineConfig;
     use storage::EngineImpl;
@@ -600,30 +601,29 @@ mod tests {
         let (_dir, table_name, table) = setup_table_with_column_default_constraint().await;
 
         let mut columns_values: HashMap<String, VectorRef> = HashMap::with_capacity(4);
-        let names = StringVector::from(vec!["first", "second"]);
-        let tss = TimestampVector::from_vec(vec![1, 2]);
+        let names: VectorRef = Arc::new(StringVector::from(vec!["first", "second"]));
+        let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2]));
 
-        columns_values.insert("name".to_string(), Arc::new(names.clone()));
-        columns_values.insert("ts".to_string(), Arc::new(tss.clone()));
+        columns_values.insert("name".to_string(), names.clone());
+        columns_values.insert("ts".to_string(), tss.clone());
 
         let insert_req = new_insert_request(table_name.to_string(), columns_values);
         assert_eq!(2, table.insert(insert_req).await.unwrap());
 
+        let session_ctx = SessionContext::new();
         let stream = table.scan(&None, &[], None).await.unwrap();
-        let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap();
+        let stream = stream.execute(0, session_ctx.task_ctx()).unwrap();
         let batches = util::collect(stream).await.unwrap();
         assert_eq!(1, batches.len());
 
-        let record = &batches[0].df_recordbatch;
+        let record = &batches[0];
         assert_eq!(record.num_columns(), 3);
-        let columns = record.columns();
-        assert_eq!(3, columns.len());
-        assert_eq!(names.to_arrow_array(), columns[0]);
+        assert_eq!(names, *record.column(0));
         assert_eq!(
-            Int32Vector::from_vec(vec![42, 42]).to_arrow_array(),
-            columns[1]
+            Arc::new(Int32Vector::from_vec(vec![42, 42])) as VectorRef,
+            *record.column(1)
         );
-        assert_eq!(tss.to_arrow_array(), columns[2]);
+        assert_eq!(tss, *record.column(2));
     }
 
@@ -631,29 +631,28 @@ mod tests {
         let (_dir, table_name, table) = setup_table_with_column_default_constraint().await;
 
         let mut columns_values: HashMap<String, VectorRef> = HashMap::with_capacity(4);
-        let names = StringVector::from(vec!["first", "second"]);
-        let nums = Int32Vector::from(vec![None, Some(66)]);
-        let tss = TimestampVector::from_vec(vec![1, 2]);
+        let names: VectorRef = Arc::new(StringVector::from(vec!["first", "second"]));
+        let nums: VectorRef = Arc::new(Int32Vector::from(vec![None, Some(66)]));
+        let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2]));
 
-        columns_values.insert("name".to_string(), Arc::new(names.clone()));
-        columns_values.insert("n".to_string(), Arc::new(nums.clone()));
-        columns_values.insert("ts".to_string(), Arc::new(tss.clone()));
+        columns_values.insert("name".to_string(), names.clone());
+        columns_values.insert("n".to_string(), nums.clone());
+        columns_values.insert("ts".to_string(), tss.clone());
 
         let insert_req = new_insert_request(table_name.to_string(), columns_values);
         assert_eq!(2, table.insert(insert_req).await.unwrap());
 
+        let session_ctx = SessionContext::new();
         let stream = table.scan(&None, &[], None).await.unwrap();
-        let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap();
+        let stream = stream.execute(0, session_ctx.task_ctx()).unwrap();
         let batches = util::collect(stream).await.unwrap();
         assert_eq!(1, batches.len());
 
-        let record = &batches[0].df_recordbatch;
+        let record = &batches[0];
         assert_eq!(record.num_columns(), 3);
-        let columns = record.columns();
-        assert_eq!(3, columns.len());
-        assert_eq!(names.to_arrow_array(), columns[0]);
-        assert_eq!(nums.to_arrow_array(), columns[1]);
-        assert_eq!(tss.to_arrow_array(), columns[2]);
+        assert_eq!(names, *record.column(0));
+        assert_eq!(nums, *record.column(1));
+        assert_eq!(tss, *record.column(2));
     }
 
     #[test]
@@ -724,73 +723,73 @@ mod tests {
         assert_eq!(0, table.insert(insert_req).await.unwrap());
 
         let mut columns_values: HashMap<String, VectorRef> = HashMap::with_capacity(4);
-        let hosts = StringVector::from(vec!["host1", "host2"]);
-        let cpus = Float64Vector::from_vec(vec![55.5, 66.6]);
-        let memories = Float64Vector::from_vec(vec![1024f64, 4096f64]);
-        let tss = TimestampVector::from_vec(vec![1, 2]);
+        let hosts: VectorRef = Arc::new(StringVector::from(vec!["host1", "host2"]));
+        let cpus: VectorRef = Arc::new(Float64Vector::from_vec(vec![55.5, 66.6]));
+        let memories: VectorRef = Arc::new(Float64Vector::from_vec(vec![1024f64, 4096f64]));
+        let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_vec(vec![1, 2]));
 
-        columns_values.insert("host".to_string(), Arc::new(hosts.clone()));
-        columns_values.insert("cpu".to_string(), Arc::new(cpus.clone()));
-        columns_values.insert("memory".to_string(), Arc::new(memories.clone()));
-        columns_values.insert("ts".to_string(), Arc::new(tss.clone()));
+        columns_values.insert("host".to_string(), hosts.clone());
+        columns_values.insert("cpu".to_string(), cpus.clone());
+        columns_values.insert("memory".to_string(), memories.clone());
+        columns_values.insert("ts".to_string(), tss.clone());
 
         let insert_req = new_insert_request("demo".to_string(), columns_values);
         assert_eq!(2, table.insert(insert_req).await.unwrap());
 
+        let session_ctx = SessionContext::new();
         let stream = table.scan(&None, &[], None).await.unwrap();
-        let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap();
+        let stream = stream.execute(0, session_ctx.task_ctx()).unwrap();
         let batches = util::collect(stream).await.unwrap();
         assert_eq!(1, batches.len());
-        assert_eq!(batches[0].df_recordbatch.num_columns(), 4);
+        assert_eq!(batches[0].num_columns(), 4);
 
-        let arrow_schema = batches[0].schema.arrow_schema();
-        assert_eq!(arrow_schema.fields().len(), 4);
+        let batch_schema = &batches[0].schema;
+        assert_eq!(batch_schema.num_columns(), 4);
+        assert_eq!(batch_schema.column_schemas()[0].name, "host");
+        assert_eq!(batch_schema.column_schemas()[1].name, "cpu");
+        assert_eq!(batch_schema.column_schemas()[2].name, "memory");
+        assert_eq!(batch_schema.column_schemas()[3].name, "ts");
 
-        assert_eq!(arrow_schema.field(0).name(), "host");
-        assert_eq!(arrow_schema.field(1).name(), "cpu");
-        assert_eq!(arrow_schema.field(2).name(), "memory");
-        assert_eq!(arrow_schema.field(3).name(), "ts");
-
-        let columns = batches[0].df_recordbatch.columns();
-        assert_eq!(4, columns.len());
-        assert_eq!(hosts.to_arrow_array(), columns[0]);
-        assert_eq!(cpus.to_arrow_array(), columns[1]);
-        assert_eq!(memories.to_arrow_array(), columns[2]);
-        assert_eq!(tss.to_arrow_array(), columns[3]);
+        let batch = &batches[0];
+        assert_eq!(4, batch.num_columns());
+        assert_eq!(hosts, *batch.column(0));
+        assert_eq!(cpus, *batch.column(1));
+        assert_eq!(memories, *batch.column(2));
+        assert_eq!(tss, *batch.column(3));
 
         // Scan with projections: cpu and memory
         let stream = table.scan(&Some(vec![1, 2]), &[], None).await.unwrap();
-        let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap();
+        let stream = stream.execute(0, session_ctx.task_ctx()).unwrap();
         let batches = util::collect(stream).await.unwrap();
         assert_eq!(1, batches.len());
-        assert_eq!(batches[0].df_recordbatch.num_columns(), 2);
+        assert_eq!(batches[0].num_columns(), 2);
 
-        let arrow_schema = batches[0].schema.arrow_schema();
-        assert_eq!(arrow_schema.fields().len(), 2);
+        let batch_schema = &batches[0].schema;
+        assert_eq!(batch_schema.num_columns(), 2);
 
-        assert_eq!(arrow_schema.field(0).name(), "cpu");
-        assert_eq!(arrow_schema.field(1).name(), "memory");
+        assert_eq!(batch_schema.column_schemas()[0].name, "cpu");
+        assert_eq!(batch_schema.column_schemas()[1].name, "memory");
 
-        let columns = batches[0].df_recordbatch.columns();
assert_eq!(2, columns.len()); - assert_eq!(cpus.to_arrow_array(), columns[0]); - assert_eq!(memories.to_arrow_array(), columns[1]); + let batch = &batches[0]; + assert_eq!(2, batch.num_columns()); + assert_eq!(cpus, *batch.column(0)); + assert_eq!(memories, *batch.column(1)); // Scan with projections: only ts let stream = table.scan(&Some(vec![3]), &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); assert_eq!(1, batches.len()); - assert_eq!(batches[0].df_recordbatch.num_columns(), 1); + assert_eq!(batches[0].num_columns(), 1); - let arrow_schema = batches[0].schema.arrow_schema(); - assert_eq!(arrow_schema.fields().len(), 1); + let batch_schema = &batches[0].schema; + assert_eq!(batch_schema.num_columns(), 1); - assert_eq!(arrow_schema.field(0).name(), "ts"); + assert_eq!(batch_schema.column_schemas()[0].name, "ts"); - let columns = batches[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(tss.to_arrow_array(), columns[0]); + let record = &batches[0]; + assert_eq!(1, record.num_columns()); + assert_eq!(tss, *record.column(0)); } #[tokio::test] @@ -804,28 +803,31 @@ mod tests { // Insert more than batch size rows to the table. let test_batch_size = default_batch_size * 4; let mut columns_values: HashMap = HashMap::with_capacity(4); - let hosts = StringVector::from(vec!["host1"; test_batch_size]); - let cpus = Float64Vector::from_vec(vec![55.5; test_batch_size]); - let memories = Float64Vector::from_vec(vec![1024f64; test_batch_size]); - let tss = TimestampVector::from_values((0..test_batch_size).map(|v| v as i64)); + let hosts: VectorRef = Arc::new(StringVector::from(vec!["host1"; test_batch_size])); + let cpus: VectorRef = Arc::new(Float64Vector::from_vec(vec![55.5; test_batch_size])); + let memories: VectorRef = Arc::new(Float64Vector::from_vec(vec![1024f64; test_batch_size])); + let tss: VectorRef = Arc::new(TimestampMillisecondVector::from_values( + (0..test_batch_size).map(|v| v as i64), + )); - columns_values.insert("host".to_string(), Arc::new(hosts)); - columns_values.insert("cpu".to_string(), Arc::new(cpus)); - columns_values.insert("memory".to_string(), Arc::new(memories)); - columns_values.insert("ts".to_string(), Arc::new(tss.clone())); + columns_values.insert("host".to_string(), hosts); + columns_values.insert("cpu".to_string(), cpus); + columns_values.insert("memory".to_string(), memories); + columns_values.insert("ts".to_string(), tss.clone()); let insert_req = new_insert_request("demo".to_string(), columns_values); assert_eq!(test_batch_size, table.insert(insert_req).await.unwrap()); + let session_ctx = SessionContext::new(); let stream = table.scan(&None, &[], None).await.unwrap(); - let stream = stream.execute(0, Arc::new(RuntimeEnv::default())).unwrap(); + let stream = stream.execute(0, session_ctx.task_ctx()).unwrap(); let batches = util::collect(stream).await.unwrap(); let mut total = 0; for batch in batches { - assert_eq!(batch.df_recordbatch.num_columns(), 4); - let ts = batch.df_recordbatch.column(3); + assert_eq!(batch.num_columns(), 4); + let ts = batch.column(3); let expect = tss.slice(total, ts.len()); - assert_eq!(expect.to_arrow_array(), *ts); + assert_eq!(expect, *ts); total += ts.len(); } assert_eq!(test_batch_size, total); diff --git a/src/mito/src/manifest/action.rs b/src/mito/src/manifest/action.rs index f8428367d4..4e2ba43db4 100644 --- 
a/src/mito/src/manifest/action.rs +++ b/src/mito/src/manifest/action.rs @@ -26,7 +26,7 @@ use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader use store_api::manifest::{ManifestVersion, MetaAction}; use table::metadata::{RawTableInfo, TableIdent}; -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct TableChange { pub table_info: RawTableInfo, } @@ -37,7 +37,7 @@ pub struct TableRemove { pub table_name: String, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub enum TableMetaAction { Protocol(ProtocolAction), // Boxed TableChange to reduce the total size of enum @@ -45,7 +45,7 @@ pub enum TableMetaAction { Remove(TableRemove), } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct TableMetaActionList { pub actions: Vec, pub prev_version: ManifestVersion, diff --git a/src/mito/src/table.rs b/src/mito/src/table.rs index 689a2b4c1b..d5f554a994 100644 --- a/src/mito/src/table.rs +++ b/src/mito/src/table.rs @@ -21,9 +21,10 @@ use std::sync::Arc; use arc_swap::ArcSwap; use async_trait::async_trait; +use common_error::ext::BoxedError; use common_query::logical_plan::Expr; use common_query::physical_plan::PhysicalPlanRef; -use common_recordbatch::error::{Error as RecordBatchError, Result as RecordBatchResult}; +use common_recordbatch::error::{ExternalSnafu, Result as RecordBatchResult}; use common_recordbatch::{RecordBatch, RecordBatchStream}; use common_telemetry::logging; use datatypes::schema::ColumnSchema; @@ -189,7 +190,7 @@ impl Table for MitoTable { let stream_schema = schema.clone(); let stream = Box::pin(async_stream::try_stream! { - while let Some(chunk) = reader.next_chunk().await.map_err(RecordBatchError::new)? { + while let Some(chunk) = reader.next_chunk().await.map_err(BoxedError::new).context(ExternalSnafu)? { yield RecordBatch::new(stream_schema.clone(), chunk.columns)? 
} }); diff --git a/src/mito/src/table/test_util/mock_engine.rs b/src/mito/src/table/test_util/mock_engine.rs index 08b137cdc7..54b845bc51 100644 --- a/src/mito/src/table/test_util/mock_engine.rs +++ b/src/mito/src/table/test_util/mock_engine.rs @@ -21,7 +21,7 @@ use arc_swap::ArcSwap; use async_trait::async_trait; use common_error::mock::MockError; use common_telemetry::logging; -use datatypes::prelude::{Value, VectorBuilder, VectorRef}; +use datatypes::prelude::{DataType, Value, VectorRef}; use datatypes::schema::{ColumnSchema, Schema}; use storage::metadata::{RegionMetaImpl, RegionMetadata}; use storage::write_batch::{Mutation, WriteBatch}; @@ -58,12 +58,11 @@ impl ChunkReader for MockChunkReader { .iter() .map(|column_schema| { let data = self.memtable.get(&column_schema.name).unwrap(); - let mut builder = - VectorBuilder::with_capacity(column_schema.data_type.clone(), data.len()); + let mut builder = column_schema.data_type.create_mutable_vector(data.len()); for v in data { - builder.push(v); + builder.push_value_ref(v.as_value_ref()).unwrap(); } - builder.finish() + builder.to_vector() }) .collect::>(); self.read = true; diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index 9676a81a39..1bb9da358a 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -15,11 +15,12 @@ common-query = { path = "../common/query" } common-recordbatch = { path = "../common/recordbatch" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [ - "simd", -] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } +datafusion = "14.0.0" +datafusion-common = "14.0.0" +datafusion-expr = "14.0.0" +datafusion-optimizer = "14.0.0" +datafusion-physical-expr = "14.0.0" +datafusion-sql = "14.0.0" datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index 8dda26a5db..0968d99357 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -141,7 +141,6 @@ impl LogicalOptimizer for DatafusionQueryEngine { LogicalPlan::DfPlan(df_plan) => { let optimized_plan = self.state - .df_context() .optimize(df_plan) .context(error::DatafusionSnafu { msg: "Fail to optimize logical plan", @@ -163,14 +162,11 @@ impl PhysicalPlanner for DatafusionQueryEngine { let _timer = timer!(metric::METRIC_CREATE_PHYSICAL_ELAPSED); match logical_plan { LogicalPlan::DfPlan(df_plan) => { - let physical_plan = self - .state - .df_context() - .create_physical_plan(df_plan) - .await - .context(error::DatafusionSnafu { + let physical_plan = self.state.create_physical_plan(df_plan).await.context( + error::DatafusionSnafu { msg: "Fail to create physical plan", - })?; + }, + )?; Ok(Arc::new(PhysicalPlanAdapter::new( Arc::new( @@ -193,22 +189,19 @@ impl PhysicalOptimizer for DatafusionQueryEngine { plan: Arc, ) -> Result> { let _timer = timer!(metric::METRIC_OPTIMIZE_PHYSICAL_ELAPSED); - let config = &self.state.df_context().state.lock().config; - let optimizers = &config.physical_optimizers; - let mut new_plan = plan + let new_plan = plan .as_any() .downcast_ref::() .context(error::PhysicalPlanDowncastSnafu)? 
.df_plan(); - for optimizer in optimizers { - new_plan = optimizer - .optimize(new_plan, config) + let new_plan = + self.state + .optimize_physical_plan(new_plan) .context(error::DatafusionSnafu { msg: "Fail to optimize physical plan", })?; - } Ok(Arc::new(PhysicalPlanAdapter::new(plan.schema(), new_plan))) } } @@ -224,7 +217,7 @@ impl QueryExecutor for DatafusionQueryEngine { match plan.output_partitioning().partition_count() { 0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))), 1 => Ok(plan - .execute(0, ctx.state().runtime()) + .execute(0, ctx.state().task_ctx()) .context(error::ExecutePhysicalPlanSnafu)?), _ => { // merge into a single partition @@ -232,11 +225,11 @@ impl QueryExecutor for DatafusionQueryEngine { CoalescePartitionsExec::new(Arc::new(DfPhysicalPlanAdapter(plan.clone()))); // CoalescePartitionsExec must produce a single partition assert_eq!(1, plan.output_partitioning().partition_count()); - let df_stream = plan.execute(0, ctx.state().runtime()).await.context( - error::DatafusionSnafu { - msg: "Failed to execute DataFusion merge exec", - }, - )?; + let df_stream = + plan.execute(0, ctx.state().task_ctx()) + .context(error::DatafusionSnafu { + msg: "Failed to execute DataFusion merge exec", + })?; let stream = RecordBatchStreamAdapter::try_new(df_stream) .context(error::ConvertDfRecordBatchStreamSnafu)?; Ok(Box::pin(stream)) @@ -254,8 +247,7 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::Output; use common_recordbatch::util; - use datafusion::field_util::{FieldExt, SchemaExt}; - use datatypes::arrow::array::UInt64Array; + use datatypes::vectors::{UInt64Vector, VectorRef}; use session::context::QueryContext; use table::table::numbers::NumbersTable; @@ -290,10 +282,10 @@ mod tests { assert_eq!( format!("{:?}", plan), - r#"DfPlan(Limit: 20 - Projection: #SUM(numbers.number) - Aggregate: groupBy=[[]], aggr=[[SUM(#numbers.number)]] - TableScan: numbers projection=None)"# + r#"DfPlan(Limit: skip=0, fetch=20 + Projection: SUM(numbers.number) + Aggregate: groupBy=[[]], aggr=[[SUM(numbers.number)]] + TableScan: numbers)"# ); } @@ -311,20 +303,20 @@ mod tests { Output::Stream(recordbatch) => { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); assert_eq!( "SUM(numbers.number)", - numbers[0].schema.arrow_schema().field(0).name() + numbers[0].schema.column_schemas()[0].name ); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); + let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 1); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt64Array::from_slice(&[4950]) + *batch.column(0), + Arc::new(UInt64Vector::from_slice(&[4950])) as VectorRef ); } _ => unreachable!(), diff --git a/src/query/src/datafusion/planner.rs b/src/query/src/datafusion/planner.rs index 6d70109e74..4c87654e3c 100644 --- a/src/query/src/datafusion/planner.rs +++ b/src/query/src/datafusion/planner.rs @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::HashMap; use std::sync::Arc; use common_query::logical_plan::create_aggregate_function; use datafusion::catalog::TableReference; -use datafusion::datasource::TableProvider; +use datafusion::error::Result as DfResult; use datafusion::physical_plan::udaf::AggregateUDF; use datafusion::physical_plan::udf::ScalarUDF; use datafusion::sql::planner::{ContextProvider, SqlToRel}; +use datafusion_expr::TableSource; use datatypes::arrow::datatypes::DataType; use session::context::QueryContextRef; use snafu::ResultExt; @@ -50,7 +52,7 @@ impl<'a, S: ContextProvider + Send + Sync> DfPlanner<'a, S> { let sql = query.inner.to_string(); let result = self .sql_to_rel - .query_to_plan(query.inner) + .query_to_plan(query.inner, &mut HashMap::new()) .context(error::PlanSqlSnafu { sql })?; Ok(LogicalPlan::DfPlan(result)) @@ -103,26 +105,14 @@ impl DfContextProviderAdapter { } } -/// TODO(dennis): Delegate all requests to ExecutionContext right now, -/// manage UDFs, UDAFs, variables by ourself in future. impl ContextProvider for DfContextProviderAdapter { - fn get_table_provider(&self, name: TableReference) -> Option> { + fn get_table_provider(&self, name: TableReference) -> DfResult> { let schema = self.query_ctx.current_schema(); - let execution_ctx = self.state.df_context().state.lock(); - match name { - TableReference::Bare { table } if schema.is_some() => { - execution_ctx.get_table_provider(TableReference::Partial { - // unwrap safety: checked in this match's arm - schema: &schema.unwrap(), - table, - }) - } - _ => execution_ctx.get_table_provider(name), - } + self.state.get_table_provider(schema.as_deref(), name) } fn get_function_meta(&self, name: &str) -> Option> { - self.state.df_context().state.lock().get_function_meta(name) + self.state.get_function_meta(name) } fn get_aggregate_meta(&self, name: &str) -> Option> { @@ -134,10 +124,6 @@ impl ContextProvider for DfContextProviderAdapter { } fn get_variable_type(&self, variable_names: &[String]) -> Option { - self.state - .df_context() - .state - .lock() - .get_variable_type(variable_names) + self.state.get_variable_type(variable_names) } } diff --git a/src/query/src/expr.rs b/src/query/src/expr.rs deleted file mode 100644 index 3a2a59181e..0000000000 --- a/src/query/src/expr.rs +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2022 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
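The query-engine hunks above track two DataFusion 14 API changes: `ExecutionPlan::execute` is now synchronous and takes an `Arc<TaskContext>` obtained from a `SessionContext` (instead of an `Arc<RuntimeEnv>`), and `ContextProvider::get_table_provider` returns `DfResult<Arc<dyn TableSource>>` rather than `Option<Arc<dyn TableProvider>>`. For reference, a minimal stand-alone sketch of the new execution path against stock DataFusion 14 (not code from this patch; the table `t`, column `n`, and query are made up for illustration):

use std::sync::Arc;

use datafusion::arrow::array::Int32Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::arrow::util::pretty::print_batches;
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::physical_plan::common::collect;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    // Register a one-column in-memory table named "t" (illustrative data).
    let schema = Arc::new(Schema::new(vec![Field::new("n", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;
    let table = MemTable::try_new(schema, vec![vec![batch]])?;
    let ctx = SessionContext::new();
    ctx.register_table("t", Arc::new(table))?;

    // Plan a query and drive partition 0 by hand, the way the adapters in
    // this patch do. `execute` now takes an Arc<TaskContext>, not a RuntimeEnv;
    // the final aggregate yields a single output partition here.
    let df = ctx.sql("SELECT SUM(n) FROM t").await?;
    let plan = df.create_physical_plan().await?;
    let stream = plan.execute(0, ctx.task_ctx())?;
    let batches = collect(stream).await?;
    print_batches(&batches)?;
    Ok(())
}

`TaskContext` snapshots the session's configuration and runtime environment, which is why the state changes below thread the custom optimizer and catalog list through `SessionState` before the `SessionContext` is built.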
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs index cee0c2727a..2e66588769 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -16,16 +16,14 @@ use std::str::FromStr; use std::sync::Arc; use common_time::timestamp::{TimeUnit, Timestamp}; -use datafusion::execution::context::ExecutionProps; -use datafusion::logical_plan::plan::Filter; -use datafusion::logical_plan::{ - Expr, ExprRewritable, ExprRewriter, ExprSchemable, LogicalPlan, Operator, TableScan, -}; use datafusion::optimizer::optimizer::OptimizerRule; -use datafusion::optimizer::utils; +use datafusion::optimizer::OptimizerConfig; use datafusion_common::{DFSchemaRef, DataFusionError, Result, ScalarValue}; +use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter}; +use datafusion_expr::{ + Between, BinaryExpr, Expr, ExprSchemable, Filter, LogicalPlan, Operator, TableScan, +}; use datatypes::arrow::compute; -use datatypes::arrow::compute::cast::CastOptions; use datatypes::arrow::datatypes::DataType; /// TypeConversionRule converts some literal values in logical plan to other types according @@ -39,24 +37,24 @@ impl OptimizerRule for TypeConversionRule { fn optimize( &self, plan: &LogicalPlan, - execution_props: &ExecutionProps, + optimizer_config: &mut OptimizerConfig, ) -> Result { let mut converter = TypeConverter { schemas: plan.all_schemas(), }; match plan { - LogicalPlan::Filter(Filter { predicate, input }) => Ok(LogicalPlan::Filter(Filter { - predicate: predicate.clone().rewrite(&mut converter)?, - input: Arc::new(self.optimize(input, execution_props)?), - })), + LogicalPlan::Filter(filter) => Ok(LogicalPlan::Filter(Filter::try_new( + filter.predicate().clone().rewrite(&mut converter)?, + Arc::new(self.optimize(filter.input(), optimizer_config)?), + )?)), LogicalPlan::TableScan(TableScan { table_name, source, projection, projected_schema, filters, - limit, + fetch, }) => { let rewrite_filters = filters .clone() @@ -69,7 +67,7 @@ impl OptimizerRule for TypeConversionRule { projection: projection.clone(), projected_schema: projected_schema.clone(), filters: rewrite_filters, - limit: *limit, + fetch: *fetch, })) } LogicalPlan::Projection { .. } @@ -86,12 +84,15 @@ impl OptimizerRule for TypeConversionRule { | LogicalPlan::CrossJoin { .. } | LogicalPlan::CreateMemoryTable { .. } | LogicalPlan::DropTable { .. } + | LogicalPlan::DropView { .. } + | LogicalPlan::Distinct { .. } | LogicalPlan::Values { .. } + | LogicalPlan::SetVariable { .. } | LogicalPlan::Analyze { .. } => { let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| self.optimize(plan, execution_props)) + .map(|plan| self.optimize(plan, optimizer_config)) .collect::>>()?; let expr = plan @@ -100,10 +101,15 @@ impl OptimizerRule for TypeConversionRule { .map(|e| e.rewrite(&mut converter)) .collect::>>()?; - utils::from_plan(plan, &expr, &new_inputs) + datafusion_expr::utils::from_plan(plan, &expr, &new_inputs) } - LogicalPlan::EmptyRelation { .. } => Ok(plan.clone()), + LogicalPlan::Subquery { .. } + | LogicalPlan::SubqueryAlias { .. } + | LogicalPlan::CreateView { .. } + | LogicalPlan::CreateCatalogSchema { .. } + | LogicalPlan::CreateCatalog { .. } + | LogicalPlan::EmptyRelation { .. 
} => Ok(plan.clone()), } } @@ -139,12 +145,11 @@ impl<'a> TypeConverter<'a> { (target_type, value) => { let value_arr = value.to_array(); let arr = - compute::cast::cast(value_arr.as_ref(), target_type, CastOptions::default()) - .map_err(DataFusionError::ArrowError)?; + compute::cast(&value_arr, target_type).map_err(DataFusionError::ArrowError)?; ScalarValue::try_from_array( - &Arc::from(arr), // index: Converts a value in `array` at `index` into a ScalarValue - 0, + &arr, + 0, // index: Converts a value in `array` at `index` into a ScalarValue ) } } @@ -188,7 +193,7 @@ impl<'a> TypeConverter<'a> { impl<'a> ExprRewriter for TypeConverter<'a> { fn mutate(&mut self, expr: Expr) -> Result { let new_expr = match expr { - Expr::BinaryExpr { left, op, right } => match op { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op { Operator::Eq | Operator::NotEq | Operator::Lt @@ -196,28 +201,28 @@ impl<'a> ExprRewriter for TypeConverter<'a> { | Operator::Gt | Operator::GtEq => { let (left, right) = self.convert_type(&left, &right)?; - Expr::BinaryExpr { + Expr::BinaryExpr(BinaryExpr { left: Box::new(left), op, right: Box::new(right), - } + }) } - _ => Expr::BinaryExpr { left, op, right }, + _ => Expr::BinaryExpr(BinaryExpr { left, op, right }), }, - Expr::Between { + Expr::Between(Between { expr, negated, low, high, - } => { + }) => { let (expr, low) = self.convert_type(&expr, &low)?; let (expr, high) = self.convert_type(&expr, &high)?; - Expr::Between { + Expr::Between(Between { expr: Box::new(expr), negated, low: Box::new(low), high: Box::new(high), - } + }) } Expr::InList { expr, diff --git a/src/query/src/plan.rs b/src/query/src/plan.rs index c7e337c0e9..5182db4f6a 100644 --- a/src/query/src/plan.rs +++ b/src/query/src/plan.rs @@ -14,7 +14,7 @@ use std::fmt::Debug; -use datafusion::logical_plan::LogicalPlan as DfLogicalPlan; +use datafusion_expr::LogicalPlan as DfLogicalPlan; /// A LogicalPlan represents the different types of relational /// operators (such as Projection, Filter, etc) and can be created by diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index 36bd331b36..a72b0203e3 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -19,16 +19,18 @@ use std::sync::{Arc, RwLock}; use catalog::CatalogListRef; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_function::scalars::aggregate::AggregateFunctionMetaRef; -use common_query::physical_plan::RuntimeEnv; +use common_query::physical_plan::{SessionContext, TaskContext}; use common_query::prelude::ScalarUdf; -use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; -use datafusion::optimizer::eliminate_limit::EliminateLimit; -use datafusion::optimizer::filter_push_down::FilterPushDown; -use datafusion::optimizer::limit_push_down::LimitPushDown; -use datafusion::optimizer::projection_push_down::ProjectionPushDown; -use datafusion::optimizer::single_distinct_to_groupby::SingleDistinctToGroupBy; -use datafusion::optimizer::to_approx_perc::ToApproxPerc; -use datafusion::prelude::{ExecutionConfig, ExecutionContext}; +use datafusion::catalog::TableReference; +use datafusion::error::Result as DfResult; +use datafusion::execution::context::{SessionConfig, SessionState}; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::physical_plan::udf::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_expr::{LogicalPlan as DfLogicalPlan, TableSource}; +use 
datafusion_optimizer::optimizer::{Optimizer, OptimizerConfig}; +use datafusion_sql::planner::ContextProvider; +use datatypes::arrow::datatypes::DataType; use crate::datafusion::DfCatalogListAdapter; use crate::optimizer::TypeConversionRule; @@ -39,7 +41,7 @@ use crate::optimizer::TypeConversionRule; // type in QueryEngine trait. #[derive(Clone)] pub struct QueryEngineState { - df_context: ExecutionContext, + df_context: SessionContext, catalog_list: CatalogListRef, aggregate_functions: Arc>>, } @@ -53,25 +55,18 @@ impl fmt::Debug for QueryEngineState { impl QueryEngineState { pub(crate) fn new(catalog_list: CatalogListRef) -> Self { - let config = ExecutionConfig::new() - .with_default_catalog_and_schema(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME) - .with_optimizer_rules(vec![ - // TODO(hl): SimplifyExpressions is not exported. - Arc::new(TypeConversionRule {}), - // These are the default optimizer in datafusion - Arc::new(CommonSubexprEliminate::new()), - Arc::new(EliminateLimit::new()), - Arc::new(ProjectionPushDown::new()), - Arc::new(FilterPushDown::new()), - Arc::new(LimitPushDown::new()), - Arc::new(SingleDistinctToGroupBy::new()), - Arc::new(ToApproxPerc::new()), - ]); + let runtime_env = Arc::new(RuntimeEnv::default()); + let session_config = SessionConfig::new() + .with_default_catalog_and_schema(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME); + let mut optimizer = Optimizer::new(&OptimizerConfig::new()); + // Apply the type conversion rule first. + optimizer.rules.insert(0, Arc::new(TypeConversionRule {})); - let df_context = ExecutionContext::with_config(config); + let mut session_state = SessionState::with_config_rt(session_config, runtime_env); + session_state.optimizer = optimizer; + session_state.catalog_list = Arc::new(DfCatalogListAdapter::new(catalog_list.clone())); - df_context.state.lock().catalog_list = - Arc::new(DfCatalogListAdapter::new(catalog_list.clone())); + let df_context = SessionContext::with_state(session_state); Self { df_context, @@ -81,11 +76,15 @@ impl QueryEngineState { } /// Register a udf function - /// TODO(dennis): manage UDFs by ourself. + // TODO(dennis): manage UDFs by ourselves. pub fn register_udf(&self, udf: ScalarUdf) { + // `SessionContext` has a `register_udf()` method, which requires `&mut self`; this is + // a workaround. + // TODO(yingwen): Use `SessionContext::register_udf()` once it takes `&self`.
+ // It's implemented in https://github.com/apache/arrow-datafusion/pull/4612 self.df_context .state - .lock() + .write() .scalar_functions .insert(udf.name.clone(), Arc::new(udf.into_df_udf())); } @@ -113,12 +112,59 @@ impl QueryEngineState { } #[inline] - pub(crate) fn df_context(&self) -> &ExecutionContext { - &self.df_context + pub(crate) fn task_ctx(&self) -> Arc { + self.df_context.task_ctx() } - #[inline] - pub(crate) fn runtime(&self) -> Arc { - self.df_context.runtime_env() + pub(crate) fn get_table_provider( + &self, + schema: Option<&str>, + name: TableReference, + ) -> DfResult> { + let state = self.df_context.state.read(); + match name { + TableReference::Bare { table } if schema.is_some() => { + state.get_table_provider(TableReference::Partial { + // unwrap safety: checked in this match's arm + schema: schema.unwrap(), + table, + }) + } + _ => state.get_table_provider(name), + } + } + + pub(crate) fn get_function_meta(&self, name: &str) -> Option> { + let state = self.df_context.state.read(); + state.get_function_meta(name) + } + + pub(crate) fn get_variable_type(&self, variable_names: &[String]) -> Option { + let state = self.df_context.state.read(); + state.get_variable_type(variable_names) + } + + pub(crate) fn optimize(&self, plan: &DfLogicalPlan) -> DfResult { + self.df_context.optimize(plan) + } + + pub(crate) async fn create_physical_plan( + &self, + logical_plan: &DfLogicalPlan, + ) -> DfResult> { + self.df_context.create_physical_plan(logical_plan).await + } + + pub(crate) fn optimize_physical_plan( + &self, + mut plan: Arc, + ) -> DfResult> { + let state = self.df_context.state.read(); + let config = &state.config; + for optimizer in &state.physical_optimizers { + plan = optimizer.optimize(plan, config)?; + } + + Ok(plan) } } diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 2854fed7fc..327394416e 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -261,10 +261,9 @@ mod test { use common_query::Output; use common_recordbatch::{RecordBatch, RecordBatches}; use common_time::timestamp::TimeUnit; - use datatypes::arrow::array::PrimitiveArray; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, Schema, SchemaRef}; - use datatypes::vectors::{StringVector, TimestampVector, UInt32Vector, VectorRef}; + use datatypes::vectors::{StringVector, TimestampMillisecondVector, UInt32Vector, VectorRef}; use snafu::ResultExt; use sql::statements::describe::DescribeTable; use table::test_util::MemTable; @@ -379,12 +378,12 @@ mod test { .with_time_index(true), ]; let data = vec![ - Arc::new(UInt32Vector::from_vec(vec![0])) as _, - Arc::new(TimestampVector::new(PrimitiveArray::from_vec(vec![0]))) as _, + Arc::new(UInt32Vector::from_slice(&[0])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ]; let expected_columns = vec![ Arc::new(StringVector::from(vec!["t1", "t2"])) as _, - Arc::new(StringVector::from(vec!["UInt32", "Timestamp"])) as _, + Arc::new(StringVector::from(vec!["UInt32", "TimestampMillisecond"])) as _, Arc::new(StringVector::from(vec![NULLABLE_YES, NULLABLE_NO])) as _, Arc::new(StringVector::from(vec!["", "current_timestamp()"])) as _, Arc::new(StringVector::from(vec![ diff --git a/src/query/tests/argmax_test.rs b/src/query/tests/argmax_test.rs index 11f0167a09..cbf1ae931d 100644 --- a/src/query/tests/argmax_test.rs +++ b/src/query/tests/argmax_test.rs @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::sync::Arc; mod function; + +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use query::error::Result; use query::QueryEngine; use session::context::QueryContext; @@ -29,7 +29,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_argmax_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_argmax { ([], $( { $T:ty } ),*) => { @@ -49,33 +49,23 @@ async fn test_argmax_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { let result = execute_argmax(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("argmax", result[0].schema.arrow_schema().field(0).name()); + let value = function::get_value_from_batches("argmax", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = match numbers.len() { 0 => 0_u64, _ => { let mut index = 0; - let mut max = numbers[0].into(); + let mut max = numbers[0]; for (i, &number) in numbers.iter().enumerate() { - if max < number.into() { - max = number.into(); + if max < number { + max = number; index = i; } } diff --git a/src/query/tests/argmin_test.rs b/src/query/tests/argmin_test.rs index 2a509f05fd..546fa9ae23 100644 --- a/src/query/tests/argmin_test.rs +++ b/src/query/tests/argmin_test.rs @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; mod function; +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use query::error::Result; use query::QueryEngine; use session::context::QueryContext; @@ -30,7 +29,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_argmin_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_argmin { ([], $( { $T:ty } ),*) => { @@ -50,33 +49,23 @@ async fn test_argmin_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + PartialOrd, - for<'a> T: Scalar = T>, + T: WrapperType + PartialOrd, { let result = execute_argmin(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("argmin", result[0].schema.arrow_schema().field(0).name()); + let value = function::get_value_from_batches("argmin", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = match numbers.len() { 0 => 0_u32, _ => { let mut index = 0; - let mut min = numbers[0].into(); + let mut min = numbers[0]; for (i, &number) in numbers.iter().enumerate() { - if min > number.into() { - min = number.into(); + if min > number { + min = number; index = i; } } diff --git a/src/query/tests/function.rs index 040dfa7a6b..7de93a6265 100644 --- a/src/query/tests/function.rs +++ b/src/query/tests/function.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// FIXME(yingwen): Consider moving all tests under query/tests to query/src so we could reuse +// more code. use std::sync::Arc; use catalog::local::{MemoryCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; @@ -22,8 +24,8 @@ use common_recordbatch::{util, RecordBatch}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::PrimitiveElement; -use datatypes::vectors::PrimitiveVector; +use datatypes::types::WrapperType; +use datatypes::vectors::Helper; use query::query_engine::QueryEngineFactory; use query::QueryEngine; use rand::Rng; @@ -47,7 +49,7 @@ pub fn create_query_engine() -> Arc { column_schemas.push(column_schema); let numbers = (1..=10).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); + let column: VectorRef = Arc::new(<$T as Scalar>::VectorType::from_vec(numbers.to_vec())); columns.push(column); )* } @@ -77,8 +79,7 @@ pub async fn get_numbers_from_table<'s, T>( engine: Arc, ) -> Vec where - T: PrimitiveElement, - for<'a> T: Scalar = T>, + T: WrapperType, { let sql = format!("SELECT {} FROM {}", column_name, table_name); let plan = engine .sql_to_plan(&sql, Arc::new(QueryContext::new())) .unwrap(); @@ -92,8 +93,21 @@ where }; let numbers = util::collect(recordbatch_stream).await.unwrap(); - let columns = numbers[0].df_recordbatch.columns(); - let column = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let column: &::VectorType = unsafe { VectorHelper::static_cast(&column) }; + let column = numbers[0].column(0); + let column: &::VectorType = unsafe { Helper::static_cast(column) }; column.iter_data().flatten().collect::>() } + +pub fn get_value_from_batches(column_name: &str, batches: Vec) -> Value { + assert_eq!(1, batches.len()); + assert_eq!(batches[0].num_columns(), 1); + assert_eq!(1, batches[0].schema.num_columns()); + assert_eq!(column_name, 
batches[0].schema.column_schemas()[0].name); + + let batch = &batches[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 1); + let v = batch.column(0); + assert_eq!(1, v.len()); + v.get(0) +} diff --git a/src/query/tests/mean_test.rs b/src/query/tests/mean_test.rs index 705dea797d..000323fb21 100644 --- a/src/query/tests/mean_test.rs +++ b/src/query/tests/mean_test.rs @@ -12,19 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; mod function; +use std::sync::Arc; + use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; +use datatypes::types::WrapperType; use datatypes::value::OrderedFloat; use format_num::NumberFormat; -use function::{create_query_engine, get_numbers_from_table}; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +32,7 @@ use session::context::QueryContext; #[tokio::test] async fn test_mean_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_mean { ([], $( { $T:ty } ),*) => { @@ -53,25 +52,15 @@ async fn test_mean_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_mean(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("mean", result[0].schema.arrow_schema().field(0).name()); + let value = function::get_value_from_batches("mean", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let expected_value = inc_stats::mean(expected_value.iter().cloned()).unwrap(); diff --git a/src/query/tests/my_sum_udaf_example.rs b/src/query/tests/my_sum_udaf_example.rs index 4e05183861..54d3a62a5b 100644 --- a/src/query/tests/my_sum_udaf_example.rs +++ b/src/query/tests/my_sum_udaf_example.rs @@ -26,12 +26,10 @@ use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::*; use common_query::Output; use common_recordbatch::{util, RecordBatch}; -use datafusion::arrow_print; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::{PrimitiveElement, PrimitiveType}; -use datatypes::vectors::PrimitiveVector; +use datatypes::types::{LogicalPrimitiveType, WrapperType}; +use datatypes::vectors::Helper; use datatypes::with_match_primitive_type_id; use num_traits::AsPrimitive; use query::error::Result; @@ -40,28 +38,30 @@ use session::context::QueryContext; use table::test_util::MemTable; #[derive(Debug, Default)] -struct MySumAccumulator 
-where - T: Primitive + AsPrimitive, - SumT: Primitive + std::ops::AddAssign, -{ +struct MySumAccumulator { sum: SumT, _phantom: PhantomData, } impl MySumAccumulator where - T: Primitive + AsPrimitive, - SumT: Primitive + std::ops::AddAssign, + T: WrapperType, + SumT: WrapperType, + T::Native: AsPrimitive, + SumT::Native: std::ops::AddAssign, { #[inline(always)] fn add(&mut self, v: T) { - self.sum += v.as_(); + let mut sum_native = self.sum.into_native(); + sum_native += v.into_native().as_(); + self.sum = SumT::from_native(sum_native); } #[inline(always)] fn merge(&mut self, s: SumT) { - self.sum += s; + let mut sum_native = self.sum.into_native(); + sum_native += s.into_native(); + self.sum = SumT::from_native(sum_native); } } @@ -76,7 +76,7 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(Box::new(MySumAccumulator::<$S, <$S as Primitive>::LargestType>::default())) + Ok(Box::new(MySumAccumulator::<<$S as LogicalPrimitiveType>::Wrapper, <<$S as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>::default())) }, { let err_msg = format!( @@ -95,7 +95,7 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { with_match_primitive_type_id!( input_type.logical_type_id(), |$S| { - Ok(PrimitiveType::<<$S as Primitive>::LargestType>::default().logical_type_id().data_type()) + Ok(<<$S as LogicalPrimitiveType>::LargestType>::build_data_type()) }, { unreachable!() @@ -110,10 +110,10 @@ impl AggregateFunctionCreator for MySumAccumulatorCreator { impl Accumulator for MySumAccumulator where - T: Primitive + AsPrimitive, - for<'a> T: Scalar = T>, - SumT: Primitive + std::ops::AddAssign, - for<'a> SumT: Scalar = SumT>, + T: WrapperType, + SumT: WrapperType, + T::Native: AsPrimitive, + SumT::Native: std::ops::AddAssign, { fn state(&self) -> QueryResult> { Ok(vec![self.sum.into()]) @@ -124,7 +124,7 @@ where return Ok(()); }; let column = &values[0]; - let column: &::VectorType = unsafe { VectorHelper::static_cast(column) }; + let column: &::VectorType = unsafe { Helper::static_cast(column) }; for v in column.iter_data().flatten() { self.add(v) } @@ -136,7 +136,7 @@ where return Ok(()); }; let states = &states[0]; - let states: &::VectorType = unsafe { VectorHelper::static_cast(states) }; + let states: &::VectorType = unsafe { Helper::static_cast(states) }; for s in states.iter_data().flatten() { self.merge(s) } @@ -154,65 +154,57 @@ async fn test_my_sum() -> Result<()> { test_my_sum_with( (1..=10).collect::>(), - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 55 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 55 | ++--------+"#, ) .await?; test_my_sum_with( (-10..=11).collect::>(), - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 11 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 11 | ++--------+"#, ) .await?; test_my_sum_with( vec![-1.0f32, 1.0, 2.0, 3.0, 4.0], - vec![ - "+--------+", - "| my_sum |", - "+--------+", - "| 9 |", - "+--------+", - ], + r#"+--------+ +| my_sum | ++--------+ +| 9 | ++--------+"#, ) .await?; test_my_sum_with( vec![u32::MAX, u32::MAX], - vec![ - "+------------+", - "| my_sum |", - "+------------+", - "| 8589934590 |", - "+------------+", - ], + r#"+------------+ +| my_sum | ++------------+ +| 8589934590 | ++------------+"#, ) .await?; Ok(()) } -async fn test_my_sum_with(numbers: Vec, expected: Vec<&str>) -> Result<()> +async fn test_my_sum_with(numbers: Vec, expected: &str) -> Result<()> where - T: 
PrimitiveElement, + T: WrapperType, { let table_name = format!("{}_numbers", std::any::type_name::()); let column_name = format!("{}_number", std::any::type_name::()); let column_schemas = vec![ColumnSchema::new( column_name.clone(), - T::build_data_type(), + T::LogicalType::build_data_type(), true, )]; let schema = Arc::new(Schema::new(column_schemas.clone())); - let column: VectorRef = Arc::new(PrimitiveVector::::from_vec(numbers)); + let column: VectorRef = Arc::new(T::VectorType::from_vec(numbers)); let recordbatch = RecordBatch::new(schema, vec![column]).unwrap(); let testing_table = MemTable::new(&table_name, recordbatch); @@ -236,14 +228,9 @@ where Output::Stream(batch) => batch, _ => unreachable!(), }; - let recordbatch = util::collect(recordbatch_stream).await.unwrap(); - let df_recordbatch = recordbatch - .into_iter() - .map(|r| r.df_recordbatch) - .collect::>(); + let batches = util::collect_batches(recordbatch_stream).await.unwrap(); - let pretty_print = arrow_print::write(&df_recordbatch); - let pretty_print = pretty_print.lines().collect::>(); + let pretty_print = batches.pretty_print().unwrap(); assert_eq!(expected, pretty_print); Ok(()) } diff --git a/src/query/tests/percentile_test.rs b/src/query/tests/percentile_test.rs index 6e210a0494..e639d4b3e6 100644 --- a/src/query/tests/percentile_test.rs +++ b/src/query/tests/percentile_test.rs @@ -20,12 +20,10 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::PrimitiveElement; -use datatypes::vectors::PrimitiveVector; +use datatypes::vectors::Int32Vector; use function::{create_query_engine, get_numbers_from_table}; use num_traits::AsPrimitive; use query::error::Result; @@ -64,9 +62,8 @@ async fn test_percentile_correctness() -> Result<()> { _ => unreachable!(), }; let record_batch = util::collect(recordbatch_stream).await.unwrap(); - let columns = record_batch[0].df_recordbatch.columns(); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let value = v.get(0); + let column = record_batch[0].column(0); + let value = column.get(0); assert_eq!(value, Value::from(9.280_000_000_000_001_f64)); Ok(()) } @@ -77,26 +74,12 @@ async fn test_percentile_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_percentile(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "percentile", - result[0].schema.arrow_schema().field(0).name() - ); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); + let value = function::get_value_from_batches("percentile", result); let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); @@ -140,9 +123,9 @@ fn create_correctness_engine() -> Arc { let column_schema = ColumnSchema::new("corr_number", 
ConcreteDataType::int32_datatype(), true); column_schemas.push(column_schema); - let numbers = vec![3_i32, 6_i32, 8_i32, 10_i32]; + let numbers = [3_i32, 6_i32, 8_i32, 10_i32]; - let column: VectorRef = Arc::new(PrimitiveVector::::from_vec(numbers.to_vec())); + let column: VectorRef = Arc::new(Int32Vector::from_slice(&numbers)); columns.push(column); let schema = Arc::new(Schema::new(column_schemas)); diff --git a/src/query/tests/polyval_test.rs b/src/query/tests/polyval_test.rs index f2e60c0217..248c0d42d7 100644 --- a/src/query/tests/polyval_test.rs +++ b/src/query/tests/polyval_test.rs @@ -18,11 +18,9 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -31,13 +29,13 @@ use session::context::QueryContext; #[tokio::test] async fn test_polyval_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! test_polyval { ([], $( { $T:ty } ),*) => { $( let column_name = format!("{}_number", std::any::type_name::<$T>()); - test_polyval_success::<$T,<$T as Primitive>::LargestType>(&column_name, "numbers", engine.clone()).await?; + test_polyval_success::<$T, <<<$T as WrapperType>::LogicalType as LogicalPrimitiveType>::LargestType as LogicalPrimitiveType>::Wrapper>(&column_name, "numbers", engine.clone()).await?; )* } } @@ -51,36 +49,27 @@ async fn test_polyval_success( engine: Arc, ) -> Result<()> where - T: Primitive + AsPrimitive + PrimitiveElement, - PolyT: Primitive + std::ops::Mul + std::iter::Sum, - for<'a> T: Scalar = T>, - for<'a> PolyT: Scalar = PolyT>, - i64: AsPrimitive, + T: WrapperType, + PolyT: WrapperType, + T::Native: AsPrimitive, + PolyT::Native: std::ops::Mul + std::iter::Sum, + i64: AsPrimitive, { let result = execute_polyval(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("polyval", result[0].schema.arrow_schema().field(0).name()); + let value = function::get_value_from_batches("polyval", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().copied(); let x = 0i64; let len = expected_value.len(); - let expected_value: PolyT = expected_value + let expected_native: PolyT::Native = expected_value .enumerate() - .map(|(i, value)| value.as_() * (x.pow((len - 1 - i) as u32)).as_()) + .map(|(i, v)| v.into_native().as_() * (x.pow((len - 1 - i) as u32)).as_()) .sum(); - assert_eq!(value, expected_value.into()); + assert_eq!(value, PolyT::from_native(expected_native).into()); Ok(()) } diff --git a/src/query/tests/pow.rs b/src/query/tests/pow.rs index 4d9006ca29..d48c28b220 
100644 --- a/src/query/tests/pow.rs +++ b/src/query/tests/pow.rs @@ -32,7 +32,7 @@ pub fn pow(args: &[VectorRef]) -> Result { assert_eq!(exponent.len(), base.len()); - let v = base + let iter = base .iter_data() .zip(exponent.iter_data()) .map(|(base, exponent)| { @@ -42,8 +42,8 @@ pub fn pow(args: &[VectorRef]) -> Result { (Some(base), Some(exponent)) => Some(base.pow(exponent)), _ => None, } - }) - .collect::(); + }); + let v = UInt32Vector::from_owned_iterator(iter); Ok(Arc::new(v) as _) } diff --git a/src/query/tests/query_engine_test.rs index cf640afba4..05bb32a2c4 100644 --- a/src/query/tests/query_engine_test.rs +++ b/src/query/tests/query_engine_test.rs @@ -13,30 +13,28 @@ // limitations under the License. mod pow; +// This is used to suppress the warning: function `create_query_engine` is never used. +// FIXME(yingwen): We eventually need to refactor these tests and move them to `query/src` +// so tests can share code with other mods. +#[allow(unused)] +mod function; use std::sync::Arc; -use catalog::local::{MemoryCatalogManager, MemoryCatalogProvider, MemorySchemaProvider}; +use catalog::local::{MemoryCatalogProvider, MemorySchemaProvider}; use catalog::{CatalogList, CatalogProvider, SchemaProvider}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_query::prelude::{create_udf, make_scalar_function, Volatility}; use common_query::Output; -use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; -use datafusion::logical_plan::LogicalPlanBuilder; -use datatypes::arrow::array::UInt32Array; -use datatypes::for_all_primitive_types; +use datafusion::datasource::DefaultTableSource; +use datafusion_expr::logical_plan::builder::LogicalPlanBuilder; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::types::{OrdPrimitive, PrimitiveElement}; -use datatypes::vectors::{PrimitiveVector, UInt32Vector}; -use num::NumCast; +use datatypes::vectors::UInt32Vector; use query::error::Result; use query::plan::LogicalPlan; use query::query_engine::QueryEngineFactory; -use query::QueryEngine; -use rand::Rng; use session::context::QueryContext; use table::table::adapter::DfTableProviderAdapter; use table::table::numbers::NumbersTable; @@ -66,12 +64,16 @@ async fn test_datafusion_query_engine() -> Result<()> { let limit = 10; let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone())); let plan = LogicalPlan::DfPlan( - LogicalPlanBuilder::scan("numbers", table_provider, None) - .unwrap() - .limit(limit) - .unwrap() - .build() - .unwrap(), + LogicalPlanBuilder::scan( + "numbers", + Arc::new(DefaultTableSource { table_provider }), + None, + ) + .unwrap() + .limit(0, Some(limit)) + .unwrap() + .build() + .unwrap(), ); let output = engine.execute(&plan).await?; @@ -84,17 +86,17 @@ async fn test_datafusion_query_engine() -> Result<()> { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); - assert_eq!("number", numbers[0].schema.arrow_schema().field(0).name()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); + assert_eq!("number", numbers[0].schema.column_schemas()[0].name); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), limit); + 
let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), limit); let expected: Vec = (0u32..limit as u32).collect(); assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt32Array::from_slice(&expected) + *batch.column(0), + Arc::new(UInt32Vector::from_slice(&expected)) as VectorRef ); Ok(()) @@ -123,7 +125,8 @@ async fn test_udf() -> Result<()> { let pow = make_scalar_function(pow); let udf = create_udf( - "pow", + // datafusion already supports pow, so we use a different name. + "my_pow", vec![ ConcreteDataType::uint32_datatype(), ConcreteDataType::uint32_datatype(), @@ -136,7 +139,7 @@ async fn test_udf() -> Result<()> { engine.register_udf(udf); let plan = engine.sql_to_plan( - "select pow(number, number) as p from numbers limit 10", + "select my_pow(number, number) as p from numbers limit 10", Arc::new(QueryContext::new()), )?; @@ -148,202 +151,18 @@ async fn test_udf() -> Result<()> { let numbers = util::collect(recordbatch).await.unwrap(); assert_eq!(1, numbers.len()); - assert_eq!(numbers[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, numbers[0].schema.arrow_schema().fields().len()); - assert_eq!("p", numbers[0].schema.arrow_schema().field(0).name()); + assert_eq!(numbers[0].num_columns(), 1); + assert_eq!(1, numbers[0].schema.num_columns()); + assert_eq!("p", numbers[0].schema.column_schemas()[0].name); - let columns = numbers[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 10); + let batch = &numbers[0]; + assert_eq!(1, batch.num_columns()); + assert_eq!(batch.column(0).len(), 10); let expected: Vec = vec![1, 1, 4, 27, 256, 3125, 46656, 823543, 16777216, 387420489]; assert_eq!( - *columns[0].as_any().downcast_ref::().unwrap(), - UInt32Array::from_slice(&expected) + *batch.column(0), + Arc::new(UInt32Vector::from_slice(&expected)) as VectorRef ); Ok(()) } - -fn create_query_engine() -> Arc { - let schema_provider = Arc::new(MemorySchemaProvider::new()); - let catalog_provider = Arc::new(MemoryCatalogProvider::new()); - let catalog_list = Arc::new(MemoryCatalogManager::default()); - - // create table with primitives, and all columns' length are even - let mut column_schemas = vec![]; - let mut columns = vec![]; - macro_rules! create_even_number_table { - ([], $( { $T:ty } ),*) => { - $( - let mut rng = rand::thread_rng(); - - let column_name = format!("{}_number_even", std::any::type_name::<$T>()); - let column_schema = ColumnSchema::new(column_name, Value::from(<$T>::default()).data_type(), true); - column_schemas.push(column_schema); - - let numbers = (1..=100).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); - columns.push(column); - )* - } - } - for_all_primitive_types! { create_even_number_table } - - let schema = Arc::new(Schema::new(column_schemas.clone())); - let recordbatch = RecordBatch::new(schema, columns).unwrap(); - let even_number_table = Arc::new(MemTable::new("even_numbers", recordbatch)); - schema_provider - .register_table( - even_number_table.table_name().to_string(), - even_number_table, - ) - .unwrap(); - - // create table with primitives, and all columns' length are odd - let mut column_schemas = vec![]; - let mut columns = vec![]; - macro_rules! 
create_odd_number_table { - ([], $( { $T:ty } ),*) => { - $( - let mut rng = rand::thread_rng(); - - let column_name = format!("{}_number_odd", std::any::type_name::<$T>()); - let column_schema = ColumnSchema::new(column_name, Value::from(<$T>::default()).data_type(), true); - column_schemas.push(column_schema); - - let numbers = (1..=99).map(|_| rng.gen::<$T>()).collect::>(); - let column: VectorRef = Arc::new(PrimitiveVector::<$T>::from_vec(numbers.to_vec())); - columns.push(column); - )* - } - } - for_all_primitive_types! { create_odd_number_table } - - let schema = Arc::new(Schema::new(column_schemas.clone())); - let recordbatch = RecordBatch::new(schema, columns).unwrap(); - let odd_number_table = Arc::new(MemTable::new("odd_numbers", recordbatch)); - schema_provider - .register_table(odd_number_table.table_name().to_string(), odd_number_table) - .unwrap(); - - catalog_provider - .register_schema(DEFAULT_SCHEMA_NAME.to_string(), schema_provider) - .unwrap(); - catalog_list - .register_catalog(DEFAULT_CATALOG_NAME.to_string(), catalog_provider) - .unwrap(); - - QueryEngineFactory::new(catalog_list).query_engine() -} - -async fn get_numbers_from_table<'s, T>( - column_name: &'s str, - table_name: &'s str, - engine: Arc, -) -> Vec> -where - T: PrimitiveElement, - for<'a> T: Scalar = T>, -{ - let sql = format!("SELECT {} FROM {}", column_name, table_name); - let plan = engine - .sql_to_plan(&sql, Arc::new(QueryContext::new())) - .unwrap(); - - let output = engine.execute(&plan).await.unwrap(); - let recordbatch_stream = match output { - Output::Stream(batch) => batch, - _ => unreachable!(), - }; - let numbers = util::collect(recordbatch_stream).await.unwrap(); - - let columns = numbers[0].df_recordbatch.columns(); - let column = VectorHelper::try_into_vector(&columns[0]).unwrap(); - let column: &::VectorType = unsafe { VectorHelper::static_cast(&column) }; - column - .iter_data() - .flatten() - .map(|x| OrdPrimitive::(x)) - .collect::>>() -} - -#[tokio::test] -async fn test_median_aggregator() -> Result<()> { - common_telemetry::init_default_ut_logging(); - - let engine = create_query_engine(); - - macro_rules! test_median { - ([], $( { $T:ty } ),*) => { - $( - let column_name = format!("{}_number_even", std::any::type_name::<$T>()); - test_median_success::<$T>(&column_name, "even_numbers", engine.clone()).await?; - - let column_name = format!("{}_number_odd", std::any::type_name::<$T>()); - test_median_success::<$T>(&column_name, "odd_numbers", engine.clone()).await?; - )* - } - } - for_all_primitive_types! 
{ test_median } - Ok(()) -} - -async fn test_median_success( - column_name: &str, - table_name: &str, - engine: Arc, -) -> Result<()> -where - T: PrimitiveElement, - for<'a> T: Scalar = T>, -{ - let result = execute_median(column_name, table_name, engine.clone()) - .await - .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!("median", result[0].schema.arrow_schema().field(0).name()); - - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let median = v.get(0); - - let mut numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; - numbers.sort(); - let len = numbers.len(); - let expected_median: Value = if len % 2 == 1 { - numbers[len / 2] - } else { - let a: f64 = NumCast::from(numbers[len / 2 - 1].as_primitive()).unwrap(); - let b: f64 = NumCast::from(numbers[len / 2].as_primitive()).unwrap(); - OrdPrimitive::(NumCast::from(a / 2.0 + b / 2.0).unwrap()) - } - .into(); - assert_eq!(expected_median, median); - Ok(()) -} - -async fn execute_median<'a>( - column_name: &'a str, - table_name: &'a str, - engine: Arc, -) -> RecordResult> { - let sql = format!( - "select MEDIAN({}) as median from {}", - column_name, table_name - ); - let plan = engine - .sql_to_plan(&sql, Arc::new(QueryContext::new())) - .unwrap(); - - let output = engine.execute(&plan).await.unwrap(); - let recordbatch_stream = match output { - Output::Stream(batch) => batch, - _ => unreachable!(), - }; - util::collect(recordbatch_stream).await -} diff --git a/src/query/tests/scipy_stats_norm_cdf_test.rs b/src/query/tests/scipy_stats_norm_cdf_test.rs index 815501a314..dee8f5c87e 100644 --- a/src/query/tests/scipy_stats_norm_cdf_test.rs +++ b/src/query/tests/scipy_stats_norm_cdf_test.rs @@ -18,11 +18,8 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; -use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +30,7 @@ use statrs::statistics::Statistics; #[tokio::test] async fn test_scipy_stats_norm_cdf_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_scipy_stats_norm_cdf { ([], $( { $T:ty } ),*) => { @@ -53,28 +50,15 @@ async fn test_scipy_stats_norm_cdf_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_scipy_stats_norm_cdf(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "scipy_stats_norm_cdf", - result[0].schema.arrow_schema().field(0).name() - ); + let value = function::get_value_from_batches("scipy_stats_norm_cdf", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let mean = expected_value.clone().mean(); let stddev = expected_value.std_dev(); diff --git a/src/query/tests/scipy_stats_norm_pdf.rs b/src/query/tests/scipy_stats_norm_pdf.rs index dd5e0fc7fc..03e4cf1292 100644 --- a/src/query/tests/scipy_stats_norm_pdf.rs +++ b/src/query/tests/scipy_stats_norm_pdf.rs @@ -18,11 +18,8 @@ mod function; use common_query::Output; use common_recordbatch::error::Result as RecordResult; use common_recordbatch::{util, RecordBatch}; -use datafusion::field_util::{FieldExt, SchemaExt}; use datatypes::for_all_primitive_types; -use datatypes::prelude::*; -use datatypes::types::PrimitiveElement; -use function::{create_query_engine, get_numbers_from_table}; +use datatypes::types::WrapperType; use num_traits::AsPrimitive; use query::error::Result; use query::QueryEngine; @@ -33,7 +30,7 @@ use statrs::statistics::Statistics; #[tokio::test] async fn test_scipy_stats_norm_pdf_aggregator() -> Result<()> { common_telemetry::init_default_ut_logging(); - let engine = create_query_engine(); + let engine = function::create_query_engine(); macro_rules! 
test_scipy_stats_norm_pdf { ([], $( { $T:ty } ),*) => { @@ -53,28 +50,15 @@ async fn test_scipy_stats_norm_pdf_success( engine: Arc, ) -> Result<()> where - T: PrimitiveElement + AsPrimitive, - for<'a> T: Scalar = T>, + T: WrapperType + AsPrimitive, { let result = execute_scipy_stats_norm_pdf(column_name, table_name, engine.clone()) .await .unwrap(); - assert_eq!(1, result.len()); - assert_eq!(result[0].df_recordbatch.num_columns(), 1); - assert_eq!(1, result[0].schema.arrow_schema().fields().len()); - assert_eq!( - "scipy_stats_norm_pdf", - result[0].schema.arrow_schema().field(0).name() - ); + let value = function::get_value_from_batches("scipy_stats_norm_pdf", result); - let columns = result[0].df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(columns[0].len(), 1); - let v = VectorHelper::try_into_vector(&columns[0]).unwrap(); - assert_eq!(1, v.len()); - let value = v.get(0); - - let numbers = get_numbers_from_table::(column_name, table_name, engine.clone()).await; + let numbers = + function::get_numbers_from_table::(column_name, table_name, engine.clone()).await; let expected_value = numbers.iter().map(|&n| n.as_()).collect::>(); let mean = expected_value.clone().mean(); let stddev = expected_value.std_dev(); diff --git a/src/script/Cargo.toml b/src/script/Cargo.toml index 43206c3ba5..3c36632647 100644 --- a/src/script/Cargo.toml +++ b/src/script/Cargo.toml @@ -8,6 +8,7 @@ license = "Apache-2.0" default = ["python"] python = [ "dep:datafusion", + "dep:datafusion-common", "dep:datafusion-expr", "dep:datafusion-physical-expr", "dep:rustpython-vm", @@ -32,10 +33,10 @@ common-recordbatch = { path = "../common/recordbatch" } common-telemetry = { path = "../common/telemetry" } common-time = { path = "../common/time" } console = "0.15" -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } -datafusion-physical-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", optional = true } +datafusion = { version = "14.0.0", optional = true } +datafusion-common = { version = "14.0.0", optional = true } +datafusion-expr = { version = "14.0.0", optional = true } +datafusion-physical-expr = { version = "14.0.0", optional = true } datatypes = { path = "../datatypes" } futures = "0.3" futures-util = "0.3" diff --git a/src/script/src/python/builtins/mod.rs b/src/script/src/python/builtins/mod.rs index 679d91289b..4cd52cc609 100644 --- a/src/script/src/python/builtins/mod.rs +++ b/src/script/src/python/builtins/mod.rs @@ -20,10 +20,9 @@ mod test; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::ColumnarValue as DFColValue; use datafusion_physical_expr::AggregateExpr; -use datatypes::arrow; use datatypes::arrow::array::ArrayRef; -use datatypes::arrow::compute::cast::CastOptions; -use datatypes::arrow::datatypes::DataType; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; use datatypes::vectors::Helper as HelperVec; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyFloat, PyInt, PyList, PyStr}; use rustpython_vm::{pymodule, AsObject, PyObjectRef, PyPayload, PyResult, VirtualMachine}; @@ -36,7 +35,7 @@ fn type_cast_error(name: &str, ty: &str, vm: &VirtualMachine) -> PyBaseException 
Ref {
     vm.new_type_error(format!("Can't cast operand of type `{name}` into `{ty}`."))
 }
 
-fn collect_diff_types_string(values: &[ScalarValue], ty: &DataType) -> String {
+fn collect_diff_types_string(values: &[ScalarValue], ty: &ArrowDataType) -> String {
     values
         .iter()
         .enumerate()
@@ -55,6 +54,10 @@ fn collect_diff_types_string(values: &[ScalarValue], ty: &DataType) -> String {
         .unwrap_or_else(|| "Nothing".to_string())
 }
 
+fn new_item_field(data_type: ArrowDataType) -> Field {
+    Field::new("item", data_type, false)
+}
+
 /// try to turn a Python Object into a PyVector or a scalar that can be used for calculation
 ///
 /// supported scalars are (left side is the python data type, right side is the rust type):
@@ -108,7 +111,7 @@ pub fn try_into_columnar_value(obj: PyObjectRef, vm: &VirtualMachine) -> PyResult<DFColValue> {
         // TODO(dennis): empty list, we set type as null.
         return Ok(DFColValue::Scalar(ScalarValue::List(
             None,
-            Box::new(DataType::Null),
+            Box::new(new_item_field(ArrowDataType::Null)),
         )));
     }
@@ -120,8 +123,8 @@ pub fn try_into_columnar_value(obj: PyObjectRef, vm: &VirtualMachine) -> PyResult<DFColValue> {
             )));
         }
         Ok(DFColValue::Scalar(ScalarValue::List(
-            Some(Box::new(ret)),
-            Box::new(ty),
+            Some(ret),
+            Box::new(new_item_field(ty)),
         )))
     } else {
         Err(vm.new_type_error(format!(
@@ -183,22 +186,14 @@ fn scalar_val_try_into_py_obj(val: ScalarValue, vm: &VirtualMachine) -> PyResult
 fn all_to_f64(col: DFColValue, vm: &VirtualMachine) -> PyResult<DFColValue> {
     match col {
         DFColValue::Array(arr) => {
-            let res = arrow::compute::cast::cast(
-                arr.as_ref(),
-                &DataType::Float64,
-                CastOptions {
-                    wrapped: true,
-                    partial: true,
-                },
-            )
-            .map_err(|err| {
+            let res = compute::cast(&arr, &ArrowDataType::Float64).map_err(|err| {
                 vm.new_type_error(format!(
                     "Arrow Type Cast Fail(from {:#?} to {:#?}): {err:#?}",
                     arr.data_type(),
-                    DataType::Float64
+                    ArrowDataType::Float64
                 ))
             })?;
-            Ok(DFColValue::Array(res.into()))
+            Ok(DFColValue::Array(res))
         }
         DFColValue::Scalar(val) => {
             let val_in_f64 = match val {
@@ -209,7 +204,7 @@ fn all_to_f64(col: DFColValue, vm: &VirtualMachine) -> PyResult<DFColValue> {
                 return Err(vm.new_type_error(format!(
                     "Can't cast type {:#?} to {:#?}",
                     val.get_datatype(),
-                    DataType::Float64
+                    ArrowDataType::Float64
                 )))
             }
         };
@@ -283,17 +278,16 @@ pub(crate) mod greptime_builtin {
     // P.S.: not extract to file because not-inlined proc macro attribute is *unstable*
     use std::sync::Arc;
 
+    use arrow::compute::kernels::{aggregate, boolean, comparison};
     use common_function::scalars::function::FunctionContext;
     use common_function::scalars::math::PowFunction;
     use common_function::scalars::{Function, FunctionRef, FUNCTION_REGISTRY};
-    use datafusion::arrow::compute::comparison::{gt_eq_scalar, lt_eq_scalar};
-    use datafusion::arrow::datatypes::DataType;
-    use datafusion::arrow::error::ArrowError;
-    use datafusion::arrow::scalar::{PrimitiveScalar, Scalar};
     use datafusion::physical_plan::expressions;
     use datafusion_expr::ColumnarValue as DFColValue;
     use datafusion_physical_expr::math_expressions;
-    use datatypes::arrow::array::{ArrayRef, NullArray};
+    use datatypes::arrow::array::{ArrayRef, Int64Array, NullArray};
+    use datatypes::arrow::error::ArrowError;
     use datatypes::arrow::{self, compute};
     use datatypes::vectors::{ConstantVector, Float64Vector, Helper, Int64Vector, VectorRef};
     use paste::paste;
@@ -386,11 +380,6 @@ pub(crate) mod greptime_builtin {
         eval_func("clip", &[v0, v1, v2], vm)
     }
 
-    #[pyfunction]
-    fn median(v: PyVectorRef, vm: &VirtualMachine) -> PyResult {
-        eval_aggr_func("median",
&[v], vm) - } - #[pyfunction] fn diff(v: PyVectorRef, vm: &VirtualMachine) -> PyResult { eval_aggr_func("diff", &[v], vm) @@ -552,7 +541,7 @@ pub(crate) mod greptime_builtin { fn random(len: usize, vm: &VirtualMachine) -> PyResult { // This is in a proc macro so using full path to avoid strange things // more info at: https://doc.rust-lang.org/reference/procedural-macros.html#procedural-macro-hygiene - let arg = NullArray::new(arrow::datatypes::DataType::Null, len); + let arg = NullArray::new(len); let args = &[DFColValue::Array(std::sync::Arc::new(arg) as _)]; let res = math_expressions::random(args).map_err(|err| from_df_err(err, vm))?; let ret = try_into_py_obj(res, vm)?; @@ -571,6 +560,17 @@ pub(crate) mod greptime_builtin { ); } + #[pyfunction] + fn median(values: PyVectorRef, vm: &VirtualMachine) -> PyResult { + bind_aggr_fn!( + Median, + vm, + &[values.to_arrow_array()], + values.to_arrow_array().data_type(), + expr0 + ); + } + /// Not implement in datafusion /// TODO(discord9): use greptime's own impl instead /* @@ -807,12 +807,16 @@ pub(crate) mod greptime_builtin { Ok(res.into()) } - fn gen_none_array(data_type: DataType, len: usize, vm: &VirtualMachine) -> PyResult { + fn gen_none_array( + data_type: ArrowDataType, + len: usize, + vm: &VirtualMachine, + ) -> PyResult { macro_rules! match_none_array { ($VAR:ident, $LEN: ident, [$($TY:ident),*]) => { paste!{ match $VAR{ - $(DataType::$TY => Arc::new(arrow::array::[<$TY Array>]::from(vec![None;$LEN])), )* + $(ArrowDataType::$TY => Arc::new(arrow::array::[<$TY Array>]::from(vec![None;$LEN])), )* _ => return Err(vm.new_type_error(format!("gen_none_array() does not support {:?}", data_type))) } } @@ -828,10 +832,10 @@ pub(crate) mod greptime_builtin { #[pyfunction] fn prev(cur: PyVectorRef, vm: &VirtualMachine) -> PyResult { - let cur: ArrayRef = cur.to_arrow_array(); + let cur = cur.to_arrow_array(); if cur.len() == 0 { let ret = cur.slice(0, 0); - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -841,10 +845,10 @@ pub(crate) mod greptime_builtin { } let cur = cur.slice(0, cur.len() - 1); // except the last one that is let fill = gen_none_array(cur.data_type().to_owned(), 1, vm)?; - let ret = compute::concatenate::concatenate(&[&*fill, &*cur]).map_err(|err| { + let ret = compute::concat(&[&*fill, &*cur]).map_err(|err| { vm.new_runtime_error(format!("Can't concat array[0] with array[0:-1]!{err:#?}")) })?; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -855,10 +859,10 @@ pub(crate) mod greptime_builtin { #[pyfunction] fn next(cur: PyVectorRef, vm: &VirtualMachine) -> PyResult { - let cur: ArrayRef = cur.to_arrow_array(); + let cur = cur.to_arrow_array(); if cur.len() == 0 { let ret = cur.slice(0, 0); - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -868,10 +872,10 @@ pub(crate) mod greptime_builtin { } let cur = cur.slice(1, cur.len() - 1); // except the last one that is let fill = gen_none_array(cur.data_type().to_owned(), 1, vm)?; - let ret = compute::concatenate::concatenate(&[&*cur, &*fill]).map_err(|err| { + let ret = compute::concat(&[&*cur, 
&*fill]).map_err(|err| {
             vm.new_runtime_error(format!("Can't concat array[0] with array[0:-1]!{err:#?}"))
         })?;
-        let ret = Helper::try_into_vector(&*ret).map_err(|e| {
+        let ret = Helper::try_into_vector(ret.clone()).map_err(|e| {
             vm.new_type_error(format!(
                 "Can't cast result into vector, result: {:?}, err: {:?}",
                 ret, e
@@ -880,55 +884,24 @@ pub(crate) mod greptime_builtin {
         Ok(ret.into())
     }
 
-    fn try_scalar_to_value(scalar: &dyn Scalar, vm: &VirtualMachine) -> PyResult<i64> {
-        let ty_error = |s: String| vm.new_type_error(s);
-        scalar
-            .as_any()
-            .downcast_ref::<PrimitiveScalar<i64>>()
-            .ok_or_else(|| {
-                ty_error(format!(
-                    "expect scalar to be i64, found{:?}",
-                    scalar.data_type()
-                ))
-            })?
-            .value()
-            .ok_or_else(|| ty_error("All element is Null in a time series array".to_string()))
-    }
-
-    /// generate interval time point
     fn gen_inteveral(
-        oldest: &dyn Scalar,
-        newest: &dyn Scalar,
+        oldest: i64,
+        newest: i64,
         duration: i64,
         vm: &VirtualMachine,
-    ) -> PyResult<Vec<PrimitiveScalar<i64>>> {
-        use datatypes::arrow::datatypes::DataType;
-        match (oldest.data_type(), newest.data_type()) {
-            (DataType::Int64, DataType::Int64) => (),
-            _ => {
-                return Err(vm.new_type_error(format!(
-                    "Expect int64, found {:?} and {:?}",
-                    oldest.data_type(),
-                    newest.data_type()
-                )));
-            }
-        }
-
-        let oldest = try_scalar_to_value(oldest, vm)?;
-        let newest = try_scalar_to_value(newest, vm)?;
+    ) -> PyResult<Vec<i64>> {
         if oldest > newest {
             return Err(vm.new_value_error(format!("{oldest} is greater than {newest}")));
         }
-        let ret = if duration > 0 {
-            (oldest..=newest)
-                .step_by(duration as usize)
-                .map(|v| PrimitiveScalar::new(DataType::Int64, Some(v)))
-                .collect::<Vec<_>>()
+        if duration > 0 {
+            let ret = (oldest..=newest)
+                .step_by(duration as usize)
+                .collect::<Vec<_>>();
+            Ok(ret)
         } else {
-            return Err(vm.new_value_error(format!("duration: {duration} is not positive number.")));
-        };
-
-        Ok(ret)
+            Err(vm.new_value_error(format!("duration: {duration} is not a positive number.")))
+        }
     }
 
 /// `func`: exec on sliding window slice of given `arr`, expect it to always return a PyVector of one element
@@ -951,12 +924,19 @@ pub(crate) mod greptime_builtin {
         let arrow_error = |err: ArrowError| vm.new_runtime_error(format!("Arrow Error: {err:#?}"));
         let datatype_error = |err: datatypes::Error| vm.new_runtime_error(format!("DataType Errors!: {err:#?}"));
-        let ts: ArrayRef = ts.to_arrow_array();
-        let arr: ArrayRef = arr.to_arrow_array();
+        let ts_array_ref: ArrayRef = ts.to_arrow_array();
+        let ts = ts_array_ref
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .ok_or_else(|| {
+                vm.new_type_error(format!("ts must be int64, found: {:?}", ts_array_ref))
+            })?;
         let slices = {
-            let oldest = compute::aggregate::min(&*ts).map_err(arrow_error)?;
-            let newest = compute::aggregate::max(&*ts).map_err(arrow_error)?;
-            gen_inteveral(&*oldest, &*newest, duration, vm)?
+            let oldest = aggregate::min(ts)
+                .ok_or_else(|| vm.new_runtime_error("ts must have a min value".to_string()))?;
+            let newest = aggregate::max(ts)
+                .ok_or_else(|| vm.new_runtime_error("ts must have a max value".to_string()))?;
+            gen_inteveral(oldest, newest, duration, vm)?
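+            // The generated series is inclusive and evenly stepped: oldest, oldest + duration, ..., up to newest.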
}; let windows = { @@ -968,11 +948,15 @@ pub(crate) mod greptime_builtin { it }) .map(|(first, second)| { - compute::boolean::and(>_eq_scalar(&*ts, first), <_eq_scalar(&*ts, second)) - .map_err(arrow_error) + let left = comparison::gt_eq_scalar(ts, *first).map_err(arrow_error)?; + let right = comparison::lt_eq_scalar(ts, *second).map_err(arrow_error)?; + boolean::and(&left, &right).map_err(arrow_error) }) .map(|mask| match mask { - Ok(mask) => compute::filter::filter(&*arr, &mask).map_err(arrow_error), + Ok(mask) => { + let arrow_arr = arr.to_arrow_array(); + compute::filter(&arrow_arr, &mask).map_err(arrow_error) + } Err(e) => Err(e), }) .collect::, _>>()? @@ -1012,16 +996,17 @@ pub(crate) mod greptime_builtin { .map(apply_interval_function) .collect::, _>>()?; - // 3. get returen vector and concat them - let ret = fn_results - .into_iter() - .try_reduce(|acc, x| { - compute::concatenate::concatenate(&[acc.as_ref(), x.as_ref()]).map(Arc::from) - }) - .map_err(arrow_error)? - .unwrap_or_else(|| Arc::from(arr.slice(0, 0))); + // 3. get returned vector and concat them + let result_arrays: Vec<_> = fn_results + .iter() + .map(|vector| vector.to_arrow_array()) + .collect(); + let result_dyn_arrays: Vec<_> = result_arrays.iter().map(|v| v.as_ref()).collect(); + let concat_array = compute::concat(&result_dyn_arrays).map_err(arrow_error)?; + let vector = Helper::try_into_vector(concat_array).map_err(datatype_error)?; + // 4. return result vector - Ok(Helper::try_into_vector(ret).map_err(datatype_error)?.into()) + Ok(PyVector::from(vector)) } /// return first element in a `PyVector` in sliced new `PyVector`, if vector's length is zero, return a zero sized slice instead @@ -1032,7 +1017,7 @@ pub(crate) mod greptime_builtin { 0 => arr.slice(0, 0), _ => arr.slice(0, 1), }; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e @@ -1049,7 +1034,7 @@ pub(crate) mod greptime_builtin { 0 => arr.slice(0, 0), _ => arr.slice(arr.len() - 1, 1), }; - let ret = Helper::try_into_vector(&*ret).map_err(|e| { + let ret = Helper::try_into_vector(ret.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", ret, e diff --git a/src/script/src/python/builtins/test.rs b/src/script/src/python/builtins/test.rs index 39caf399e2..16828ba883 100644 --- a/src/script/src/python/builtins/test.rs +++ b/src/script/src/python/builtins/test.rs @@ -19,10 +19,10 @@ use std::path::Path; use std::sync::Arc; use common_telemetry::{error, info}; -use datatypes::arrow::array::{Float64Array, Int64Array, PrimitiveArray}; -use datatypes::arrow::compute::cast::CastOptions; -use datatypes::arrow::datatypes::DataType; -use datatypes::vectors::VectorRef; +use datatypes::arrow::array::{Float64Array, Int64Array}; +use datatypes::arrow::compute; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; +use datatypes::vectors::{Float64Vector, Int64Vector, VectorRef}; use ron::from_str as from_ron_string; use rustpython_vm::builtins::{PyFloat, PyInt, PyList}; use rustpython_vm::class::PyClassImpl; @@ -69,17 +69,17 @@ fn convert_scalar_to_py_obj_and_back() { panic!("Convert errors, expect 1") } let col = DFColValue::Scalar(ScalarValue::List( - Some(Box::new(vec![ + Some(vec![ ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2)), - ])), - Box::new(DataType::Int64), + ]), + Box::new(Field::new("item", ArrowDataType::Int64, false)), 
)); let to = try_into_py_obj(col, vm).unwrap(); let back = try_into_columnar_value(to, vm).unwrap(); - if let DFColValue::Scalar(ScalarValue::List(Some(list), ty)) = back { + if let DFColValue::Scalar(ScalarValue::List(Some(list), field)) = back { assert_eq!(list.len(), 2); - assert_eq!(ty.as_ref(), &DataType::Int64); + assert_eq!(*field.data_type(), ArrowDataType::Int64); } let list: Vec = vec![vm.ctx.new_int(1).into(), vm.ctx.new_int(2).into()]; let nested_list: Vec = @@ -93,12 +93,10 @@ fn convert_scalar_to_py_obj_and_back() { )); } - let list: PyVector = PyVector::from( - HelperVec::try_into_vector( - Arc::new(PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4])) as ArrayRef, - ) - .unwrap(), - ); + let list: PyVector = + PyVector::from( + Arc::new(Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4])) as VectorRef + ); let nested_list: Vec = vec![list.into_pyobject(vm), vm.ctx.new_int(3).into()]; let list_obj = vm.ctx.new_list(nested_list).into(); let expect_err = try_into_columnar_value(list_obj, vm); @@ -116,7 +114,7 @@ struct TestCase { #[derive(Debug, Serialize, Deserialize)] struct Var { value: PyValue, - ty: DataType, + ty: ArrowDataType, } /// for floating number comparison @@ -190,25 +188,25 @@ impl PyValue { } } -fn is_float(ty: &DataType) -> bool { +fn is_float(ty: &ArrowDataType) -> bool { matches!( ty, - DataType::Float16 | DataType::Float32 | DataType::Float64 + ArrowDataType::Float16 | ArrowDataType::Float32 | ArrowDataType::Float64 ) } /// unsigned included -fn is_int(ty: &DataType) -> bool { +fn is_int(ty: &ArrowDataType) -> bool { matches!( ty, - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 + ArrowDataType::UInt8 + | ArrowDataType::UInt16 + | ArrowDataType::UInt32 + | ArrowDataType::UInt64 + | ArrowDataType::Int8 + | ArrowDataType::Int16 + | ArrowDataType::Int32 + | ArrowDataType::Int64 ) } @@ -218,7 +216,7 @@ impl PyValue { PyValue::FloatVec(v) => { Arc::new(datatypes::vectors::Float64Vector::from_vec(v.clone())) } - PyValue::IntVec(v) => Arc::new(datatypes::vectors::Int64Vector::from_vec(v.clone())), + PyValue::IntVec(v) => Arc::new(Int64Vector::from_vec(v.clone())), PyValue::Int(v) => return Ok(vm.ctx.new_int(*v).into()), PyValue::Float(v) => return Ok(vm.ctx.new_float(*v).into()), Self::Bool(v) => return Ok(vm.ctx.new_bool(*v).into()), @@ -235,16 +233,9 @@ impl PyValue { let res = res.to_arrow_array(); let ty = res.data_type(); if is_float(ty) { - let vec_f64 = arrow::compute::cast::cast( - res.as_ref(), - &DataType::Float64, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .map_err(|err| format!("{err:#?}"))?; - assert_eq!(vec_f64.data_type(), &DataType::Float64); + let vec_f64 = compute::cast(&res, &ArrowDataType::Float64) + .map_err(|err| format!("{err:#?}"))?; + assert_eq!(vec_f64.data_type(), &ArrowDataType::Float64); let vec_f64 = vec_f64 .as_any() .downcast_ref::() @@ -252,13 +243,6 @@ impl PyValue { let ret = vec_f64 .into_iter() .map(|v| v.map(|inner| inner.to_owned())) - /* .enumerate() - .map(|(idx, v)| { - v.ok_or(format!( - "No null element expected, found one in {idx} position" - )) - .map(|v| v.to_owned()) - })*/ .collect::>(); if ret.iter().all(|x| x.is_some()) { Ok(Self::FloatVec( @@ -268,16 +252,9 @@ impl PyValue { Ok(Self::FloatVecWithNull(ret)) } } else if is_int(ty) { - let vec_int = arrow::compute::cast::cast( - res.as_ref(), - &DataType::Int64, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .map_err(|err| 
format!("{err:#?}"))?;
-            assert_eq!(vec_int.data_type(), &DataType::Int64);
+            let vec_int = compute::cast(&res, &ArrowDataType::Int64)
+                .map_err(|err| format!("{err:#?}"))?;
+            assert_eq!(vec_int.data_type(), &ArrowDataType::Int64);
             let vec_i64 = vec_int
                 .as_any()
                 .downcast_ref::<Int64Array>()
@@ -294,7 +271,7 @@ impl PyValue {
                 .collect::<Result<Vec<_>, _>>()?;
             Ok(Self::IntVec(ret))
         } else {
-            Err(format!("unspupported DataType:{ty:#?}"))
+            Err(format!("unsupported ArrowDataType:{ty:#?}"))
         }
     } else if is_instance::(obj, vm) {
         let res = obj
diff --git a/src/script/src/python/coprocessor.rs b/src/script/src/python/coprocessor.rs
index 3bc5c39f2a..3dcc348562 100644
--- a/src/script/src/python/coprocessor.rs
+++ b/src/script/src/python/coprocessor.rs
@@ -16,19 +16,18 @@ pub mod compile;
 pub mod parse;
 
 use std::cell::RefCell;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::result::Result as StdResult;
 use std::sync::Arc;
 
 use common_recordbatch::RecordBatch;
 use common_telemetry::info;
-use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
-use datatypes::arrow;
-use datatypes::arrow::array::{Array, ArrayRef};
-use datatypes::arrow::compute::cast::CastOptions;
-use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
-use datatypes::schema::Schema;
-use datatypes::vectors::{BooleanVector, Helper, StringVector, Vector, VectorRef};
+use datatypes::arrow::array::Array;
+use datatypes::arrow::compute;
+use datatypes::arrow::datatypes::DataType as ArrowDataType;
+use datatypes::data_type::{ConcreteDataType, DataType};
+use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
+use datatypes::vectors::{Helper, VectorRef};
 use rustpython_compiler_core::CodeObject;
 use rustpython_vm as vm;
 use rustpython_vm::class::PyClassImpl;
@@ -43,7 +42,8 @@ use vm::{Interpreter, PyObjectRef, VirtualMachine};
 use crate::python::builtins::greptime_builtin;
 use crate::python::coprocessor::parse::DecoratorArgs;
 use crate::python::error::{
-    ensure, ret_other_error_with, ArrowSnafu, OtherSnafu, Result, TypeCastSnafu,
+    ensure, ret_other_error_with, ArrowSnafu, NewRecordBatchSnafu, OtherSnafu, Result,
+    TypeCastSnafu,
 };
 use crate::python::utils::{format_py_error, is_instance, py_vec_obj_to_array};
 use crate::python::PyVector;
@@ -54,7 +54,8 @@ thread_local!(static INTERPRETER: RefCell<Option<Arc<Interpreter>>> = RefCell::new(None));
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct AnnotationInfo {
     /// if None, use types inferred by PyVector
-    pub datatype: Option<DataType>,
+    // TODO(yingwen): We should use our data type. i.e. ConcreteDataType.
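+    // The annotation still carries an arrow `DataType`; `gen_schema` converts it
+    // into a `ConcreteDataType` when the output schema is built.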
+ pub datatype: Option, pub is_nullable: bool, } @@ -95,7 +96,7 @@ impl Coprocessor { /// generate [`Schema`] according to return names, types, /// if no annotation /// the datatypes of the actual columns is used directly - fn gen_schema(&self, cols: &[ArrayRef]) -> Result> { + fn gen_schema(&self, cols: &[VectorRef]) -> Result { let names = &self.deco_args.ret_names; let anno = &self.return_types; ensure!( @@ -109,35 +110,38 @@ impl Coprocessor { ) } ); - Ok(Arc::new(ArrowSchema::from( - names - .iter() - .enumerate() - .map(|(idx, name)| { - let real_ty = cols[idx].data_type().to_owned(); - let AnnotationInfo { - datatype: ty, - is_nullable, - } = anno[idx].to_owned().unwrap_or_else(|| { - // default to be not nullable and use DataType inferred by PyVector itself - AnnotationInfo { - datatype: Some(real_ty.to_owned()), - is_nullable: false, - } - }); - Field::new( - name, - // if type is like `_` or `_ | None` - ty.unwrap_or(real_ty), - is_nullable, - ) - }) - .collect::>(), - ))) + + let column_schemas = names + .iter() + .enumerate() + .map(|(idx, name)| { + let real_ty = cols[idx].data_type(); + let AnnotationInfo { + datatype: ty, + is_nullable, + } = anno[idx].to_owned().unwrap_or_else(|| { + // default to be not nullable and use DataType inferred by PyVector itself + AnnotationInfo { + datatype: Some(real_ty.as_arrow_type()), + is_nullable: false, + } + }); + let column_type = match ty { + Some(arrow_type) => { + ConcreteDataType::try_from(&arrow_type).context(TypeCastSnafu)? + } + // if type is like `_` or `_ | None` + None => real_ty, + }; + Ok(ColumnSchema::new(name, column_type, is_nullable)) + }) + .collect::>>()?; + + Ok(Arc::new(Schema::new(column_schemas))) } /// check if real types and annotation types(if have) is the same, if not try cast columns to annotated type - fn check_and_cast_type(&self, cols: &mut [ArrayRef]) -> Result<()> { + fn check_and_cast_type(&self, cols: &mut [VectorRef]) -> Result<()> { let return_types = &self.return_types; // allow ignore Return Type Annotation if return_types.is_empty() { @@ -161,21 +165,10 @@ impl Coprocessor { { let real_ty = col.data_type(); let anno_ty = datatype; - if real_ty != anno_ty { - { - // This`CastOption` allow for overflowly cast and int to float loosely cast etc.., - // check its doc for more information - *col = arrow::compute::cast::cast( - col.as_ref(), - anno_ty, - CastOptions { - wrapped: true, - partial: true, - }, - ) - .context(ArrowSnafu)? 
- .into(); - } + if real_ty.as_arrow_type() != *anno_ty { + let array = col.to_arrow_array(); + let array = compute::cast(&array, anno_ty).context(ArrowSnafu)?; + *col = Helper::try_into_vector(array).context(TypeCastSnafu)?; } } } @@ -183,47 +176,6 @@ impl Coprocessor { } } -/// cast a `dyn Array` of type unsigned/int/float into a `dyn Vector` -fn try_into_vector(arg: Arc) -> Result> { - // wrap try_into_vector in here to convert `datatypes::error::Error` to `python::error::Error` - Helper::try_into_vector(arg).context(TypeCastSnafu) -} - -/// convert a `Vec` into a `Vec` only when they are of supported types -/// PyVector now only support unsigned&int8/16/32/64, float32/64 and bool when doing meanful arithmetics operation -fn try_into_py_vector(fetch_args: Vec) -> Result> { - let mut args: Vec = Vec::with_capacity(fetch_args.len()); - for (idx, arg) in fetch_args.into_iter().enumerate() { - let v: VectorRef = match arg.data_type() { - DataType::Float32 => try_into_vector::(arg)?, - DataType::Float64 => try_into_vector::(arg)?, - DataType::UInt8 => try_into_vector::(arg)?, - DataType::UInt16 => try_into_vector::(arg)?, - DataType::UInt32 => try_into_vector::(arg)?, - DataType::UInt64 => try_into_vector::(arg)?, - DataType::Int8 => try_into_vector::(arg)?, - DataType::Int16 => try_into_vector::(arg)?, - DataType::Int32 => try_into_vector::(arg)?, - DataType::Int64 => try_into_vector::(arg)?, - DataType::Utf8 => { - Arc::new(StringVector::try_from_arrow_array(arg).context(TypeCastSnafu)?) as _ - } - DataType::Boolean => { - Arc::new(BooleanVector::try_from_arrow_array(arg).context(TypeCastSnafu)?) as _ - } - _ => { - return ret_other_error_with(format!( - "Unsupported data type at column {idx}: {:?} for coprocessor", - arg.data_type() - )) - .fail() - } - }; - args.push(PyVector::from(v)); - } - Ok(args) -} - /// convert a tuple of `PyVector` or one `PyVector`(wrapped in a Python Object Ref[`PyObjectRef`]) /// to a `Vec` /// by default, a constant(int/float/bool) gives the a constant array of same length with input args @@ -231,7 +183,7 @@ fn try_into_columns( obj: &PyObjectRef, vm: &VirtualMachine, col_len: usize, -) -> Result> { +) -> Result> { if is_instance::(obj, vm) { let tuple = obj.payload::().with_context(|| { ret_other_error_with(format!("can't cast obj {:?} to PyTuple)", obj)) @@ -239,7 +191,7 @@ fn try_into_columns( let cols = tuple .iter() .map(|obj| py_vec_obj_to_array(obj, vm, col_len)) - .collect::>>()?; + .collect::>>()?; Ok(cols) } else { let col = py_vec_obj_to_array(obj, vm, col_len)?; @@ -249,27 +201,16 @@ fn try_into_columns( /// select columns according to `fetch_names` from `rb` /// and cast them into a Vec of PyVector -fn select_from_rb(rb: &DfRecordBatch, fetch_names: &[String]) -> Result> { - let field_map: HashMap<&String, usize> = rb - .schema() - .fields +fn select_from_rb(rb: &RecordBatch, fetch_names: &[String]) -> Result> { + fetch_names .iter() - .enumerate() - .map(|(idx, field)| (&field.name, idx)) - .collect(); - let fetch_idx: Vec = fetch_names - .iter() - .map(|field| { - field_map.get(field).copied().context(OtherSnafu { - reason: format!("Can't found field name {field}"), - }) + .map(|name| { + let vector = rb.column_by_name(name).with_context(|| OtherSnafu { + reason: format!("Can't find field name {}", name), + })?; + Ok(PyVector::from(vector.clone())) }) - .collect::>>()?; - let fetch_args: Vec> = fetch_idx - .into_iter() - .map(|idx| rb.column(idx).clone()) - .collect(); - try_into_py_vector(fetch_args) + .collect() } /// match between arguments' 
real type and annotation types
@@ -277,12 +218,12 @@ fn select_from_rb(rb: &DfRecordBatch, fetch_names: &[String]) -> Result<Vec<PyVector>> {
 fn check_args_anno_real_type(
     args: &[PyVector],
     copr: &Coprocessor,
-    rb: &DfRecordBatch,
+    rb: &RecordBatch,
 ) -> Result<()> {
     for (idx, arg) in args.iter().enumerate() {
         let anno_ty = copr.arg_types[idx].to_owned();
         let real_ty = arg.to_arrow_array().data_type().to_owned();
-        let is_nullable: bool = rb.schema().fields[idx].is_nullable;
+        let is_nullable: bool = rb.schema.column_schemas()[idx].is_nullable();
         ensure!(
             anno_ty
                 .to_owned()
@@ -323,31 +264,32 @@ fn set_items_in_scope(
 /// The coprocessor function accepts a python script and a Record Batch:
 /// ## What it does
-/// 1. it take a python script and a [`DfRecordBatch`], extract columns and annotation info according to `args` given in decorator in python script
+/// 1. it takes a python script and a [`RecordBatch`], extracts columns and annotation info according to `args` given in the decorator in the python script
 /// 2. executes python code and returns a vector or a tuple of vectors,
-/// 3. the returning vector(s) is assembled into a new [`DfRecordBatch`] according to `returns` in python decorator and return to caller
+/// 3. the returned vector(s) are assembled into a new [`RecordBatch`] according to `returns` in the python decorator and returned to the caller
 ///
 /// # Example
 ///
 /// ```ignore
 /// use std::sync::Arc;
-/// use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
-/// use datatypes::arrow::array::PrimitiveArray;
-/// use datatypes::arrow::datatypes::{DataType, Field, Schema};
+/// use common_recordbatch::RecordBatch;
+/// use datatypes::prelude::*;
+/// use datatypes::schema::{ColumnSchema, Schema};
+/// use datatypes::vectors::{Float32Vector, Float64Vector};
 /// use common_function::scalars::python::exec_coprocessor;
 /// let python_source = r#"
 /// @copr(args=["cpu", "mem"], returns=["perf", "what"])
 /// def a(cpu, mem):
 ///     return cpu + mem, cpu - mem
 /// "#;
-/// let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.6]);
-/// let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]);
-/// let schema = Arc::new(Schema::from(vec![
-///     Field::new("cpu", DataType::Float32, false),
-///     Field::new("mem", DataType::Float64, false),
+/// let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.6]);
+/// let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]);
+/// let schema = Arc::new(Schema::new(vec![
+///     ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false),
+///     ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false),
 /// ]));
 /// let rb =
-///     DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap();
+///     RecordBatch::new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap();
 /// let ret = exec_coprocessor(python_source, &rb).unwrap();
 /// assert_eq!(ret.column(0).len(), 4);
 /// ```
@@ -357,7 +299,7 @@
 ///
 /// Currently supported types are `u8`, `u16`, `u32`, `u64`, `i8`, `i16`, `i32`, `i64` and `f16`, `f32`, `f64`
 ///
-/// use `f64 | None` to mark if returning column is nullable like in [`DfRecordBatch`]'s schema's [`Field`]'s is_nullable
+/// use `f64 | None` to mark whether a returned column is nullable, like in [`RecordBatch`]'s schema's [`ColumnSchema`]'s is_nullable
 ///
 /// you can also use a single underscore `_` to let the coprocessor infer the type, so `_` and `_ | None` are both valid in type annotations.
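/// For example, an annotated coprocessor might look like this (illustrative sketch):
///
/// ```ignore
/// @copr(args=["cpu"], returns=["r"])
/// def r(cpu) -> vector[f64 | None]:
///     return cpu
/// ```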
/// Note: using `_` means a non-nullable column, and using `_ | None` means a nullable column
@@ -373,7 +315,7 @@ fn set_items_in_scope(
 /// You can return constants in python code like `return 1, 1.0, True`,
 /// which creates a constant array (with the same value; currently supports int, float and bool) as a returned column
 #[cfg(test)]
-pub fn exec_coprocessor(script: &str, rb: &DfRecordBatch) -> Result<RecordBatch> {
+pub fn exec_coprocessor(script: &str, rb: &RecordBatch) -> Result<RecordBatch> {
     // 1. parse the script and check if it's only a function with a `@coprocessor` decorator, and get `args` and `returns`,
     // 2. also check that `args` exist in `rb`; if not found, return an error
     // TODO(discord9): cache the result of parse_copr
@@ -383,7 +325,7 @@ pub fn exec_coprocessor(script: &str, rb: &DfRecordBatch) -> Result<RecordBatch>
 pub(crate) fn exec_with_cached_vm(
     copr: &Coprocessor,
-    rb: &DfRecordBatch,
+    rb: &RecordBatch,
     args: Vec<PyVector>,
     vm: &Arc<Interpreter>,
 ) -> Result<RecordBatch> {
@@ -401,7 +343,7 @@ pub(crate) fn exec_with_cached_vm(
         // 5. get returns as either a PyVector or a PyTuple, and name their schema according to `returns`
         let col_len = rb.num_rows();
-        let mut cols: Vec<ArrayRef> = try_into_columns(&ret, vm, col_len)?;
+        let mut cols = try_into_columns(&ret, vm, col_len)?;
         ensure!(
             cols.len() == copr.deco_args.ret_names.len(),
             OtherSnafu {
@@ -417,11 +359,7 @@ pub(crate) fn exec_with_cached_vm(
         copr.check_and_cast_type(&mut cols)?;
         // 6. return an assembled RecordBatch
         let schema = copr.gen_schema(&cols)?;
-        let res_rb = DfRecordBatch::try_new(schema.clone(), cols).context(ArrowSnafu)?;
-        Ok(RecordBatch {
-            schema: Arc::new(Schema::try_from(schema).context(TypeCastSnafu)?),
-            df_recordbatch: res_rb,
-        })
+        RecordBatch::new(schema, cols).context(NewRecordBatchSnafu)
     })
 }
@@ -459,7 +397,7 @@ pub(crate) fn init_interpreter() -> Arc<Interpreter> {
 }
 
 /// using a parsed `Coprocessor` struct as input to execute python code
-pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &DfRecordBatch) -> Result<RecordBatch> {
+pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &RecordBatch) -> Result<RecordBatch> {
     // 3.
get args from `rb`, and cast them into PyVector let args: Vec = select_from_rb(rb, &copr.deco_args.arg_names)?; check_args_anno_real_type(&args, copr, rb)?; @@ -477,7 +415,7 @@ pub(crate) fn exec_parsed(copr: &Coprocessor, rb: &DfRecordBatch) -> Result StdResult { diff --git a/src/script/src/python/engine.rs b/src/script/src/python/engine.rs index 7ad5390f7b..848bf71d8b 100644 --- a/src/script/src/python/engine.rs +++ b/src/script/src/python/engine.rs @@ -59,7 +59,7 @@ impl Stream for CoprStream { match Pin::new(&mut self.stream).poll_next(cx) { Poll::Pending => Poll::Pending, Poll::Ready(Some(Ok(recordbatch))) => { - let batch = exec_parsed(&self.copr, &recordbatch.df_recordbatch) + let batch = exec_parsed(&self.copr, &recordbatch) .map_err(BoxedError::new) .context(ExternalSnafu)?; @@ -149,8 +149,8 @@ mod tests { use catalog::{CatalogList, CatalogProvider, SchemaProvider}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_recordbatch::util; - use datafusion_common::field_util::{FieldExt, SchemaExt}; - use datatypes::arrow::array::{Float64Array, Int64Array}; + use datatypes::prelude::ScalarVector; + use datatypes::vectors::{Float64Vector, Int64Vector}; use query::QueryEngineFactory; use table::table::numbers::NumbersTable; @@ -177,6 +177,7 @@ mod tests { let script_engine = PyEngine::new(query_engine.clone()); + // To avoid divide by zero, the script divides `add(a, b)` by `g.sqrt(c + 1)` instead of `g.sqrt(c)` let script = r#" import greptime as g def add(a, b): @@ -184,7 +185,7 @@ def add(a, b): @copr(args=["a", "b", "c"], returns = ["r"], sql="select number as a,number as b,number as c from numbers limit 100") def test(a, b, c): - return add(a, b) / g.sqrt(c) + return add(a, b) / g.sqrt(c + 1) "#; let script = script_engine .compile(script, CompileContext::default()) @@ -197,15 +198,18 @@ def test(a, b, c): assert_eq!(1, numbers.len()); let number = &numbers[0]; - assert_eq!(number.df_recordbatch.num_columns(), 1); - assert_eq!("r", number.schema.arrow_schema().field(0).name()); + assert_eq!(number.num_columns(), 1); + assert_eq!("r", number.schema.column_schemas()[0].name); - let columns = number.df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(100, columns[0].len()); - let rows = columns[0].as_any().downcast_ref::().unwrap(); - assert!(rows.value(0).is_nan()); - assert_eq!((99f64 + 99f64) / 99f64.sqrt(), rows.value(99)) + assert_eq!(1, number.num_columns()); + assert_eq!(100, number.column(0).len()); + let rows = number + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(0f64, rows.get_data(0).unwrap()); + assert_eq!((99f64 + 99f64) / 100f64.sqrt(), rows.get_data(99).unwrap()) } _ => unreachable!(), } @@ -229,15 +233,18 @@ def test(a): assert_eq!(1, numbers.len()); let number = &numbers[0]; - assert_eq!(number.df_recordbatch.num_columns(), 1); - assert_eq!("r", number.schema.arrow_schema().field(0).name()); + assert_eq!(number.num_columns(), 1); + assert_eq!("r", number.schema.column_schemas()[0].name); - let columns = number.df_recordbatch.columns(); - assert_eq!(1, columns.len()); - assert_eq!(50, columns[0].len()); - let rows = columns[0].as_any().downcast_ref::().unwrap(); - assert_eq!(0, rows.value(0)); - assert_eq!(98, rows.value(49)) + assert_eq!(1, number.num_columns()); + assert_eq!(50, number.column(0).len()); + let rows = number + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(0, rows.get_data(0).unwrap()); + assert_eq!(98, rows.get_data(49).unwrap()) } _ => unreachable!(), } 
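The engine tests above all lean on the same migration pattern: results now come back as `common_recordbatch::RecordBatch` values holding typed vectors, so assertions downcast to a concrete vector type and read rows with `ScalarVector::get_data` instead of reaching through `df_recordbatch` into arrow arrays. A minimal sketch of that access path, using only APIs exercised in this patch (the helper name `first_column_f64` is illustrative, not part of the patch):

```rust
use common_recordbatch::RecordBatch;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::Float64Vector;

/// Reads row `i` of column 0 as an `f64` through the vector API.
/// Returns `None` for a null slot, or if the column is not a `Float64Vector`.
fn first_column_f64(batch: &RecordBatch, i: usize) -> Option<f64> {
    batch
        .column(0) // a VectorRef; no arrow downcast required
        .as_any()
        .downcast_ref::<Float64Vector>()?
        .get_data(i) // Option<f64>, like `rows.get_data(0)` in the tests above
}
```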
diff --git a/src/script/src/python/error.rs b/src/script/src/python/error.rs index 9a77984149..6e20e86db0 100644 --- a/src/script/src/python/error.rs +++ b/src/script/src/python/error.rs @@ -105,6 +105,12 @@ pub enum Error { #[snafu(backtrace)] source: common_recordbatch::error::Error, }, + + #[snafu(display("Failed to create record batch, source: {}", source))] + NewRecordBatch { + #[snafu(backtrace)] + source: common_recordbatch::error::Error, + }, } impl From for Error { @@ -121,7 +127,9 @@ impl ErrorExt for Error { | Error::PyRuntime { .. } | Error::Other { .. } => StatusCode::Internal, - Error::RecordBatch { source } => source.status_code(), + Error::RecordBatch { source } | Error::NewRecordBatch { source } => { + source.status_code() + } Error::DatabaseQuery { source } => source.status_code(), Error::TypeCast { source } => source.status_code(), diff --git a/src/script/src/python/test.rs b/src/script/src/python/test.rs index 4c0bcdcd25..49b511c101 100644 --- a/src/script/src/python/test.rs +++ b/src/script/src/python/test.rs @@ -12,19 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![allow(clippy::print_stdout, clippy::print_stderr)] -// for debug purpose, also this is already a -// test module so allow print_stdout shouldn't be a problem? use std::fs::File; use std::io::prelude::*; use std::path::Path; use std::sync::Arc; +use common_recordbatch::RecordBatch; use common_telemetry::{error, info}; use console::style; -use datafusion_common::record_batch::RecordBatch as DfRecordBatch; -use datatypes::arrow::array::PrimitiveArray; -use datatypes::arrow::datatypes::{DataType, Field, Schema}; +use datatypes::arrow::datatypes::DataType as ArrowDataType; +use datatypes::data_type::{ConcreteDataType, DataType}; +use datatypes::schema::{ColumnSchema, Schema}; +use datatypes::vectors::{Float32Vector, Float64Vector, Int64Vector, VectorRef}; use ron::from_str as from_ron_string; use rustpython_parser::parser; use serde::{Deserialize, Serialize}; @@ -63,19 +62,26 @@ enum Predicate { #[derive(Serialize, Deserialize, Debug)] struct ColumnInfo { - pub ty: DataType, + pub ty: ArrowDataType, pub len: usize, } -fn create_sample_recordbatch() -> DfRecordBatch { - let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.6]); - let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]); - let schema = Arc::new(Schema::from(vec![ - Field::new("cpu", DataType::Float32, false), - Field::new("mem", DataType::Float64, false), +fn create_sample_recordbatch() -> RecordBatch { + let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.6]); + let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]); + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false), ])); - DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap() + RecordBatch::new( + schema, + [ + Arc::new(cpu_array) as VectorRef, + Arc::new(mem_array) as VectorRef, + ], + ) + .unwrap() } /// test cases which read from a .ron file, deser, @@ -120,37 +126,27 @@ fn run_ron_testcases() { } Predicate::ExecIsOk { fields, columns } => { let rb = create_sample_recordbatch(); - let res = coprocessor::exec_coprocessor(&testcase.code, &rb); - if res.is_err() { - dbg!(&res); - } - assert!(res.is_ok()); - let res = res.unwrap(); + let res = coprocessor::exec_coprocessor(&testcase.code, &rb).unwrap(); fields 
.iter() - .zip(&res.schema.arrow_schema().fields) - .map(|(anno, real)| { + .zip(res.schema.column_schemas()) + .for_each(|(anno, real)| { assert!( - anno.datatype.clone().unwrap() == real.data_type - && anno.is_nullable == real.is_nullable, + anno.datatype.as_ref().unwrap() == &real.data_type.as_arrow_type() + && anno.is_nullable == real.is_nullable(), "Fields expected to be {anno:#?}, actual {real:#?}" ); - }) - .count(); - columns - .iter() - .zip(res.df_recordbatch.columns()) - .map(|(anno, real)| { - assert!( - &anno.ty == real.data_type() && anno.len == real.len(), - "Type or length not match! Expect [{:#?}; {}], actual [{:#?}; {}]", - anno.ty, - anno.len, - real.data_type(), - real.len() - ); - }) - .count(); + }); + columns.iter().zip(res.columns()).for_each(|(anno, real)| { + assert!( + anno.ty == real.data_type().as_arrow_type() && anno.len == real.len(), + "Type or length not match! Expect [{:#?}; {}], actual [{:#?}; {}]", + anno.ty, + anno.len, + real.data_type(), + real.len() + ); + }); } Predicate::ExecIsErr { reason: part_reason, @@ -229,7 +225,7 @@ def calc_rvs(open_time, close): rv_180d = vector([calc_rv(close, open_time, timepoint, datetime("180d"))]) return rv_7d, rv_15d, rv_30d, rv_60d, rv_90d, rv_180d "#; - let close_array = PrimitiveArray::from_slice([ + let close_array = Float32Vector::from_slice([ 10106.79f32, 10106.09, 10108.73, @@ -242,17 +238,20 @@ def calc_rvs(open_time, close): 10117.08, 10120.43, ]); - let open_time_array = PrimitiveArray::from_slice([ + let open_time_array = Int64Vector::from_slice([ 300i64, 900i64, 1200i64, 1800i64, 2400i64, 3000i64, 3600i64, 4200i64, 4800i64, 5400i64, 6000i64, ]); - let schema = Arc::new(Schema::from(vec![ - Field::new("close", DataType::Float32, false), - Field::new("open_time", DataType::Int64, false), + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("close", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("open_time", ConcreteDataType::int64_datatype(), false), ])); - let rb = DfRecordBatch::try_new( + let rb = RecordBatch::new( schema, - vec![Arc::new(close_array), Arc::new(open_time_array)], + [ + Arc::new(close_array) as VectorRef, + Arc::new(open_time_array) as VectorRef, + ], ) .unwrap(); let ret = coprocessor::exec_coprocessor(python_source, &rb); @@ -291,14 +290,20 @@ def a(cpu, mem): ref = log2(fed/prev(fed)) return (0.5 < cpu) & ~( cpu >= 0.75) "#; - let cpu_array = PrimitiveArray::from_slice([0.9f32, 0.8, 0.7, 0.3]); - let mem_array = PrimitiveArray::from_slice([0.1f64, 0.2, 0.3, 0.4]); - let schema = Arc::new(Schema::from(vec![ - Field::new("cpu", DataType::Float32, false), - Field::new("mem", DataType::Float64, false), + let cpu_array = Float32Vector::from_slice([0.9f32, 0.8, 0.7, 0.3]); + let mem_array = Float64Vector::from_slice([0.1f64, 0.2, 0.3, 0.4]); + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("cpu", ConcreteDataType::float32_datatype(), false), + ColumnSchema::new("mem", ConcreteDataType::float64_datatype(), false), ])); - let rb = - DfRecordBatch::try_new(schema, vec![Arc::new(cpu_array), Arc::new(mem_array)]).unwrap(); + let rb = RecordBatch::new( + schema, + [ + Arc::new(cpu_array) as VectorRef, + Arc::new(mem_array) as VectorRef, + ], + ) + .unwrap(); let ret = coprocessor::exec_coprocessor(python_source, &rb); if let Err(Error::PyParse { backtrace: _, diff --git a/src/script/src/python/utils.rs b/src/script/src/python/utils.rs index fcc0bf3956..8f078c163c 100644 --- a/src/script/src/python/utils.rs +++ b/src/script/src/python/utils.rs @@ -14,10 
+14,12 @@ use std::sync::Arc; -use datafusion::arrow::array::{ArrayRef, BooleanArray, NullArray, PrimitiveArray, Utf8Array}; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue as DFColValue; -use datatypes::arrow::datatypes::DataType; +use datatypes::prelude::ScalarVector; +use datatypes::vectors::{ + BooleanVector, Float64Vector, Helper, Int64Vector, NullVector, StringVector, VectorRef, +}; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyFloat, PyInt, PyList, PyStr}; use rustpython_vm::{PyObjectRef, PyPayload, PyRef, VirtualMachine}; use snafu::{Backtrace, GenerateImplicitData, OptionExt, ResultExt}; @@ -54,26 +56,26 @@ pub fn py_vec_obj_to_array( obj: &PyObjectRef, vm: &VirtualMachine, col_len: usize, -) -> Result { +) -> Result { // It's ugly, but we can't find a better way right now. if is_instance::(obj, vm) { let pyv = obj.payload::().with_context(|| { ret_other_error_with(format!("can't cast obj {:?} to PyVector", obj)) })?; - Ok(pyv.to_arrow_array()) + Ok(pyv.as_vector_ref()) } else if is_instance::(obj, vm) { let val = obj .to_owned() .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = PrimitiveArray::from_vec(vec![val; col_len]); + let ret = Int64Vector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj .to_owned() .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = PrimitiveArray::from_vec(vec![val; col_len]); + let ret = Float64Vector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj @@ -81,7 +83,7 @@ pub fn py_vec_obj_to_array( .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = BooleanArray::from_iter(std::iter::repeat(Some(val)).take(col_len)); + let ret = BooleanVector::from_iterator(std::iter::repeat(val).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let val = obj @@ -89,7 +91,7 @@ pub fn py_vec_obj_to_array( .try_into_value::(vm) .map_err(|e| format_py_error(e, vm))?; - let ret = Utf8Array::::from_iter(std::iter::repeat(Some(val)).take(col_len)); + let ret = StringVector::from_iterator(std::iter::repeat(val.as_str()).take(col_len)); Ok(Arc::new(ret) as _) } else if is_instance::(obj, vm) { let columnar_value = @@ -101,9 +103,9 @@ pub fn py_vec_obj_to_array( let array = ScalarValue::iter_to_array(scalars.into_iter()) .context(error::DataFusionSnafu)?; - Ok(array) + Helper::try_into_vector(array).context(error::TypeCastSnafu) } - None => Ok(Arc::new(NullArray::new(DataType::Null, 0))), + None => Ok(Arc::new(NullVector::new(0))), }, _ => unreachable!(), } diff --git a/src/script/src/python/vector.rs b/src/script/src/python/vector.rs index 448df3e62e..47fae45ed1 100644 --- a/src/script/src/python/vector.rs +++ b/src/script/src/python/vector.rs @@ -19,17 +19,17 @@ use std::sync::Arc; use common_time::date::Date; use common_time::datetime::DateTime; use common_time::timestamp::Timestamp; -use datatypes::arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use datatypes::arrow::array::{ + Array, ArrayRef, BooleanArray, Float64Array, Int64Array, UInt64Array, +}; use datatypes::arrow::compute; -use datatypes::arrow::compute::cast::{self, CastOptions}; -use datatypes::arrow::compute::{arithmetics, comparison}; -use datatypes::arrow::datatypes::DataType; -use datatypes::arrow::scalar::{PrimitiveScalar, Scalar}; -use datatypes::data_type::ConcreteDataType; +use 
datatypes::arrow::compute::kernels::{arithmetic, boolean, comparison}; +use datatypes::arrow::datatypes::DataType as ArrowDataType; +use datatypes::arrow::error::Result as ArrowResult; +use datatypes::data_type::{ConcreteDataType, DataType}; use datatypes::prelude::Value; -use datatypes::value::OrderedFloat; -use datatypes::vectors::{Helper, NullVector, VectorBuilder, VectorRef}; -use datatypes::{arrow, value}; +use datatypes::value::{self, OrderedFloat}; +use datatypes::vectors::{Helper, NullVector, VectorRef}; use rustpython_vm::builtins::{PyBaseExceptionRef, PyBool, PyBytes, PyFloat, PyInt, PyNone, PyStr}; use rustpython_vm::function::{Either, OptionalArg, PyComparisonValue}; use rustpython_vm::protocol::{PyMappingMethods, PySequenceMethods}; @@ -55,120 +55,71 @@ impl From for PyVector { fn emit_cast_error( vm: &VirtualMachine, - src_ty: &DataType, - dst_ty: &DataType, + src_ty: &ArrowDataType, + dst_ty: &ArrowDataType, ) -> PyBaseExceptionRef { vm.new_type_error(format!( "Can't cast source operand of type {:?} into target type of {:?}", src_ty, dst_ty )) } -fn arrow2_rsub_scalar( - arr: &dyn Array, - val: &dyn Scalar, - _vm: &VirtualMachine, -) -> PyResult> { - // b - a => a * (-1) + b - let neg = arithmetics::mul_scalar(arr, &PrimitiveScalar::new(DataType::Int64, Some(-1i64))); - Ok(arithmetics::add_scalar(neg.as_ref(), val)) + +/// Performs `val - arr`. +fn arrow_rsub(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult { + arithmetic::subtract_dyn(val, arr).map_err(|e| vm.new_type_error(format!("rsub error: {}", e))) } -fn arrow2_rtruediv_scalar( - arr: &dyn Array, - val: &dyn Scalar, - vm: &VirtualMachine, -) -> PyResult> { - // val / arr => one_arr / arr * val (this is simpler to write) - let one_arr: Box = if is_float(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1f64; arr.len()])) - } else if is_integer(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1i64; arr.len()])) - } else { - return Err(vm.new_not_implemented_error(format!( - "truediv of {:?} Scalar with {:?} Array is not supported", - val.data_type(), - arr.data_type() - ))); - }; - let tmp = arithmetics::mul_scalar(one_arr.as_ref(), val); - Ok(arithmetics::div(tmp.as_ref(), arr)) +/// Performs `val / arr` +fn arrow_rtruediv(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult { + arithmetic::divide_dyn(val, arr) + .map_err(|e| vm.new_type_error(format!("rtruediv error: {}", e))) } -fn arrow2_rfloordiv_scalar( - arr: &dyn Array, - val: &dyn Scalar, - vm: &VirtualMachine, -) -> PyResult> { - // val // arr => one_arr // arr * val (this is simpler to write) - let one_arr: Box = if is_float(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1f64; arr.len()])) - } else if is_integer(arr.data_type()) { - Box::new(PrimitiveArray::from_values(vec![1i64; arr.len()])) - } else { - return Err(vm.new_not_implemented_error(format!( - "truediv of {:?} Scalar with {:?} Array is not supported", - val.data_type(), - arr.data_type() - ))); - }; - let tmp = arithmetics::mul_scalar(one_arr.as_ref(), val); - - Ok(arrow::compute::cast::cast( - arithmetics::div(tmp.as_ref(), arr).as_ref(), - &DataType::Int64, - cast::CastOptions { - wrapped: false, - partial: true, - }, - ) - .unwrap()) +/// Performs `val / arr`, but cast to i64. 
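+/// e.g. dividing `7.0` by `2.0` yields `3.5`, which the `Int64` cast truncates to `3`.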
+fn arrow_rfloordiv(arr: &dyn Array, val: &dyn Array, vm: &VirtualMachine) -> PyResult<ArrayRef> {
+    let array = arithmetic::divide_dyn(val, arr)
+        .map_err(|e| vm.new_type_error(format!("rfloordiv divide error: {}", e)))?;
+    compute::cast(&array, &ArrowDataType::Int64)
+        .map_err(|e| vm.new_type_error(format!("rfloordiv cast error: {}", e)))
 }
-fn wrap_result<F>(
-    f: F,
-) -> impl Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult<Box<dyn Array>>
+fn wrap_result<F>(f: F) -> impl Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult<ArrayRef>
 where
-    F: Fn(&dyn Array, &dyn Scalar) -> Box<dyn Array>,
+    F: Fn(&dyn Array, &dyn Array) -> ArrowResult<ArrayRef>,
 {
-    move |left, right, _vm| Ok(f(left, right))
+    move |left, right, vm| {
+        f(left, right).map_err(|e| vm.new_type_error(format!("arithmetic error {}", e)))
+    }
 }
-fn is_float(datatype: &DataType) -> bool {
+fn is_float(datatype: &ArrowDataType) -> bool {
     matches!(
         datatype,
-        DataType::Float16 | DataType::Float32 | DataType::Float64
+        ArrowDataType::Float16 | ArrowDataType::Float32 | ArrowDataType::Float64
     )
 }
-fn is_integer(datatype: &DataType) -> bool {
-    is_signed(datatype) || is_unsigned(datatype)
-}
-
-fn is_signed(datatype: &DataType) -> bool {
+fn is_signed(datatype: &ArrowDataType) -> bool {
     matches!(
         datatype,
-        DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64
+        ArrowDataType::Int8 | ArrowDataType::Int16 | ArrowDataType::Int32 | ArrowDataType::Int64
     )
 }
-fn is_unsigned(datatype: &DataType) -> bool {
+fn is_unsigned(datatype: &ArrowDataType) -> bool {
     matches!(
         datatype,
-        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64
+        ArrowDataType::UInt8
+            | ArrowDataType::UInt16
+            | ArrowDataType::UInt32
+            | ArrowDataType::UInt64
     )
 }
-fn cast(array: ArrayRef, target_type: &DataType, vm: &VirtualMachine) -> PyResult<Box<dyn Array>> {
-    cast::cast(
-        array.as_ref(),
-        target_type,
-        CastOptions {
-            wrapped: true,
-            partial: true,
-        },
-    )
-    .map_err(|e| vm.new_type_error(e.to_string()))
+fn cast(array: ArrayRef, target_type: &ArrowDataType, vm: &VirtualMachine) -> PyResult<ArrayRef> {
+    compute::cast(&array, target_type).map_err(|e| vm.new_type_error(e.to_string()))
 }
+
 fn from_debug_error(err: impl std::fmt::Debug, vm: &VirtualMachine) -> PyBaseExceptionRef {
     vm.new_runtime_error(format!("Runtime Error: {err:#?}"))
 }
@@ -194,7 +145,7 @@ impl PyVector {
         }
         let datatype = get_concrete_type(&elements[0], vm)?;
-        let mut buf = VectorBuilder::with_capacity(datatype.clone(), elements.len());
+        let mut buf = datatype.create_mutable_vector(elements.len());
         for obj in elements.drain(..) {
             let val = if let Some(v) =
@@ -207,11 +158,12 @@ impl PyVector {
                     obj, datatype
                 )));
             };
-            buf.push(&val);
+            // Safety: `pyobj_try_to_typed_val()` has checked the data type.
+            buf.push_value_ref(val.as_value_ref()).unwrap();
         }
         Ok(PyVector {
-            vector: buf.finish(),
+            vector: buf.to_vector(),
         })
     } else {
         Ok(PyVector::default())
     }
 }
@@ -232,23 +184,26 @@ impl PyVector {
     fn scalar_arith_op<F>(
         &self,
         other: PyObjectRef,
-        target_type: Option<DataType>,
+        target_type: Option<ArrowDataType>,
         op: F,
         vm: &VirtualMachine,
     ) -> PyResult<PyVector>
     where
-        F: Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult<Box<dyn Array>>,
+        F: Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult<ArrayRef>,
     {
-        // the right operand only support PyInt or PyFloat,
+        // The right operand only supports PyInt or PyFloat.
         let (right, right_type) = {
             if is_instance::<PyInt>(&other, vm) {
                 other
                     .try_into_value::<i64>(vm)
                     .map(|v| (value::Value::Int64(v), ArrowDataType::Int64))?
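                 // Editor's note (not part of the patch): arrow's `*_dyn`
                 // kernels take two arrays and have no scalar variants, so the
                 // Python scalar is materialized below into a constant array of
                 // the vector's length before the kernel runs, roughly:
                 //     let rhs = Int64Array::from_value(v, left.len());
                 //     let sum = arithmetic::add_dyn(left.as_ref(), &rhs)?;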
             } else if is_instance::<PyFloat>(&other, vm) {
-                other
-                    .try_into_value::<f64>(vm)
-                    .map(|v| (value::Value::Float64(OrderedFloat(v)), DataType::Float64))?
+                other.try_into_value::<f64>(vm).map(|v| {
+                    (
+                        value::Value::Float64(OrderedFloat(v)),
+                        ArrowDataType::Float64,
+                    )
+                })?
             } else {
                 return Err(vm.new_type_error(format!(
                     "Can't cast right operand into Scalar of Int or Float, actual: {}",
@@ -264,45 +219,38 @@ impl PyVector {
         // TODO(discord9): find a better way to cast between signed and unsigned types
         let target_type = target_type.unwrap_or_else(|| {
             if is_signed(left_type) && is_signed(right_type) {
-                DataType::Int64
+                ArrowDataType::Int64
             } else if is_unsigned(left_type) && is_unsigned(right_type) {
-                DataType::UInt64
+                ArrowDataType::UInt64
             } else {
-                DataType::Float64
+                ArrowDataType::Float64
             }
         });
         let left = cast(left, &target_type, vm)?;
-        let right: Box<dyn Scalar> = if is_float(&target_type) {
+        let left_len = left.len();
+
+        // Convert `right` to an array of `target_type`.
+        let right: Box<dyn Array> = if is_float(&target_type) {
             match right {
-                value::Value::Int64(v) => {
-                    Box::new(PrimitiveScalar::new(target_type, Some(v as f64)))
-                }
-                value::Value::UInt64(v) => {
-                    Box::new(PrimitiveScalar::new(target_type, Some(v as f64)))
-                }
+                value::Value::Int64(v) => Box::new(Float64Array::from_value(v as f64, left_len)),
+                value::Value::UInt64(v) => Box::new(Float64Array::from_value(v as f64, left_len)),
                 value::Value::Float64(v) => {
-                    Box::new(PrimitiveScalar::new(target_type, Some(f64::from(v))))
+                    Box::new(Float64Array::from_value(f64::from(v), left_len))
                 }
                 _ => unreachable!(),
             }
         } else if is_signed(&target_type) {
             match right {
-                value::Value::Int64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))),
-                value::Value::UInt64(v) => {
-                    Box::new(PrimitiveScalar::new(target_type, Some(v as i64)))
-                }
-                value::Value::Float64(v) => {
-                    Box::new(PrimitiveScalar::new(DataType::Float64, Some(v.0 as i64)))
-                }
+                value::Value::Int64(v) => Box::new(Int64Array::from_value(v, left_len)),
+                value::Value::UInt64(v) => Box::new(Int64Array::from_value(v as i64, left_len)),
+                value::Value::Float64(v) => Box::new(Int64Array::from_value(v.0 as i64, left_len)),
                 _ => unreachable!(),
             }
         } else if is_unsigned(&target_type) {
             match right {
-                value::Value::Int64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))),
-                value::Value::UInt64(v) => Box::new(PrimitiveScalar::new(target_type, Some(v))),
-                value::Value::Float64(v) => {
-                    Box::new(PrimitiveScalar::new(target_type, Some(f64::from(v))))
-                }
+                value::Value::Int64(v) => Box::new(UInt64Array::from_value(v as u64, left_len)),
+                value::Value::UInt64(v) => Box::new(UInt64Array::from_value(v, left_len)),
+                value::Value::Float64(v) => Box::new(UInt64Array::from_value(v.0 as u64, left_len)),
                 _ => unreachable!(),
            }
        } else {
@@ -311,7 +259,7 @@ impl PyVector {
        let result = op(left.as_ref(), right.as_ref(), vm)?;
-        Ok(Helper::try_into_vector(&*result)
+        Ok(Helper::try_into_vector(result.clone())
            .map_err(|e| {
                vm.new_type_error(format!(
                    "Can't cast result into vector, result: {:?}, err: {:?}",
@@ -324,12 +272,12 @@ impl PyVector {
    fn arith_op<F>(
        &self,
        other: PyObjectRef,
-        target_type: Option<DataType>,
+        target_type: Option<ArrowDataType>,
        op: F,
        vm: &VirtualMachine,
    ) -> PyResult<PyVector>
    where
-        F: Fn(&dyn Array, &dyn Array) -> Box<dyn Array>,
+        F: Fn(&dyn Array, &dyn Array) -> ArrowResult<ArrayRef>,
    {
        let right = other.downcast_ref::<PyVector>().ok_or_else(|| {
            vm.new_type_error(format!(
@@ -345,20 +293,21 @@ impl PyVector {
        let target_type = target_type.unwrap_or_else(|| {
            if is_signed(left_type) && is_signed(right_type) {
-                DataType::Int64
+
ArrowDataType::Int64 } else if is_unsigned(left_type) && is_unsigned(right_type) { - DataType::UInt64 + ArrowDataType::UInt64 } else { - DataType::Float64 + ArrowDataType::Float64 } }); let left = cast(left, &target_type, vm)?; let right = cast(right, &target_type, vm)?; - let result = op(left.as_ref(), right.as_ref()); + let result = op(left.as_ref(), right.as_ref()) + .map_err(|e| vm.new_type_error(format!("Can't compute op, error: {}", e)))?; - Ok(Helper::try_into_vector(&*result) + Ok(Helper::try_into_vector(result.clone()) .map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", @@ -372,27 +321,27 @@ impl PyVector { #[pymethod(magic)] fn add(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::add_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::add_dyn), vm) } else { - self.arith_op(other, None, arithmetics::add, vm) + self.arith_op(other, None, arithmetic::add_dyn, vm) } } #[pymethod(magic)] fn sub(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::sub_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::subtract_dyn), vm) } else { - self.arith_op(other, None, arithmetics::sub, vm) + self.arith_op(other, None, arithmetic::subtract_dyn, vm) } } #[pymethod(magic)] fn rsub(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, arrow2_rsub_scalar, vm) + self.scalar_arith_op(other, None, arrow_rsub, vm) } else { - self.arith_op(other, None, |a, b| arithmetics::sub(b, a), vm) + self.arith_op(other, None, |a, b| arithmetic::subtract_dyn(b, a), vm) } } @@ -400,9 +349,9 @@ impl PyVector { #[pymethod(magic)] fn mul(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, None, wrap_result(arithmetics::mul_scalar), vm) + self.scalar_arith_op(other, None, wrap_result(arithmetic::multiply_dyn), vm) } else { - self.arith_op(other, None, arithmetics::mul, vm) + self.arith_op(other, None, arithmetic::multiply_dyn, vm) } } @@ -411,24 +360,29 @@ impl PyVector { if is_pyobj_scalar(&other, vm) { self.scalar_arith_op( other, - Some(DataType::Float64), - wrap_result(arithmetics::div_scalar), + Some(ArrowDataType::Float64), + wrap_result(arithmetic::divide_dyn), vm, ) } else { - self.arith_op(other, Some(DataType::Float64), arithmetics::div, vm) + self.arith_op( + other, + Some(ArrowDataType::Float64), + arithmetic::divide_dyn, + vm, + ) } } #[pymethod(magic)] fn rtruediv(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { - self.scalar_arith_op(other, Some(DataType::Float64), arrow2_rtruediv_scalar, vm) + self.scalar_arith_op(other, Some(ArrowDataType::Float64), arrow_rtruediv, vm) } else { self.arith_op( other, - Some(DataType::Float64), - |a, b| arithmetics::div(b, a), + Some(ArrowDataType::Float64), + |a, b| arithmetic::divide_dyn(b, a), vm, ) } @@ -439,12 +393,17 @@ impl PyVector { if is_pyobj_scalar(&other, vm) { self.scalar_arith_op( other, - Some(DataType::Int64), - wrap_result(arithmetics::div_scalar), + Some(ArrowDataType::Int64), + wrap_result(arithmetic::divide_dyn), vm, ) } else { - self.arith_op(other, Some(DataType::Int64), arithmetics::div, vm) + self.arith_op( + other, + Some(ArrowDataType::Int64), + 
arithmetic::divide_dyn, + vm, + ) } } @@ -452,12 +411,12 @@ impl PyVector { fn rfloordiv(&self, other: PyObjectRef, vm: &VirtualMachine) -> PyResult { if is_pyobj_scalar(&other, vm) { // FIXME: DataType convert problem, target_type should be inferred? - self.scalar_arith_op(other, Some(DataType::Int64), arrow2_rfloordiv_scalar, vm) + self.scalar_arith_op(other, Some(ArrowDataType::Int64), arrow_rfloordiv, vm) } else { self.arith_op( other, - Some(DataType::Int64), - |a, b| arithmetics::div(b, a), + Some(ArrowDataType::Int64), + |a, b| arithmetic::divide_dyn(b, a), vm, ) } @@ -533,9 +492,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::and(left, right).map_err(|err| from_debug_error(err, vm))?; + let res = boolean::and(left, right).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -551,9 +510,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::or(left, right).map_err(|err| from_debug_error(err, vm))?; + let res = boolean::or(left, right).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -565,9 +524,9 @@ impl PyVector { .as_any() .downcast_ref::() .ok_or_else(|| vm.new_type_error(format!("Can't cast {left:#?} as a Boolean Array")))?; - let res = compute::boolean::not(left); + let res = boolean::not(left).map_err(|err| from_debug_error(err, vm))?; let res = Arc::new(res) as ArrayRef; - let ret = Helper::try_into_vector(&*res).map_err(|err| from_debug_error(err, vm))?; + let ret = Helper::try_into_vector(res.clone()).map_err(|err| from_debug_error(err, vm))?; Ok(ret.into()) } @@ -580,15 +539,15 @@ impl PyVector { #[pymethod(name = "filter")] fn filter(&self, other: PyVectorRef, vm: &VirtualMachine) -> PyResult { let left = self.to_arrow_array(); - let right: ArrayRef = other.to_arrow_array(); + let right = other.to_arrow_array(); let filter = right.as_any().downcast_ref::(); match filter { Some(filter) => { - let res = compute::filter::filter(left.as_ref(), filter); + let res = compute::filter(left.as_ref(), filter); let res = res.map_err(|err| vm.new_runtime_error(format!("Arrow Error: {err:#?}")))?; - let ret = Helper::try_into_vector(&*res).map_err(|e| { + let ret = Helper::try_into_vector(res.clone()).map_err(|e| { vm.new_type_error(format!( "Can't cast result into vector, result: {:?}, err: {:?}", res, e @@ -618,14 +577,10 @@ impl PyVector { .ok_or_else(|| { vm.new_type_error(format!("Can't cast {seq:#?} as a Boolean Array")) })?; - // let left = self.to_arrow_array(); - let res = compute::filter::filter(self.to_arrow_array().as_ref(), mask) + let res = compute::filter(self.to_arrow_array().as_ref(), mask) .map_err(|err| vm.new_runtime_error(format!("Arrow Error: {err:#?}")))?; - let ret = Helper::try_into_vector(&*res).map_err(|e| { - vm.new_type_error(format!( - "Can't cast result into vector, result: {:?}, err: {:?}", - res, e - )) + let ret = Helper::try_into_vector(res.clone()).map_err(|e| { + 
vm.new_type_error(format!("Can't cast result into vector, err: {:?}", e)) })?; Ok(Self::from(ret).into_pyobject(vm)) } else { @@ -654,9 +609,9 @@ impl PyVector { let (mut range, step, slice_len) = slice.adjust_indices(self.len()); let vector = self.as_vector_ref(); - let mut buf = VectorBuilder::with_capacity(vector.data_type(), slice_len); + let mut buf = vector.data_type().create_mutable_vector(slice_len); if slice_len == 0 { - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } else if step == 1 { let v: PyVector = vector.slice(range.next().unwrap_or(0), slice_len).into(); @@ -664,15 +619,17 @@ impl PyVector { } else if step.is_negative() { // Negative step require special treatment for i in range.rev().step_by(step.unsigned_abs()) { - buf.push(&vector.get(i)) + // Safety: This mutable vector is created from the vector's data type. + buf.push_value_ref(vector.get_ref(i)).unwrap(); } - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } else { for i in range.step_by(step.unsigned_abs()) { - buf.push(&vector.get(i)) + // Safety: This mutable vector is created from the vector's data type. + buf.push_value_ref(vector.get_ref(i)).unwrap(); } - let v: PyVector = buf.finish().into(); + let v: PyVector = buf.to_vector().into(); Ok(v.into_pyobject(vm)) } } @@ -693,19 +650,19 @@ impl PyVector { /// get corresponding arrow op function according to given PyComaprsionOp /// /// TODO(discord9): impl scalar version function -fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> Box { +fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> ArrowResult { let op_bool_arr = match op { - PyComparisonOp::Eq => comparison::eq, - PyComparisonOp::Ne => comparison::neq, - PyComparisonOp::Gt => comparison::gt, - PyComparisonOp::Lt => comparison::lt, - PyComparisonOp::Ge => comparison::gt_eq, - PyComparisonOp::Le => comparison::lt_eq, + PyComparisonOp::Eq => comparison::eq_dyn, + PyComparisonOp::Ne => comparison::neq_dyn, + PyComparisonOp::Gt => comparison::gt_dyn, + PyComparisonOp::Lt => comparison::lt_dyn, + PyComparisonOp::Ge => comparison::gt_eq_dyn, + PyComparisonOp::Le => comparison::lt_eq_dyn, }; - move |a: &dyn Array, b: &dyn Array| -> Box { - let ret = op_bool_arr(a, b); - Box::new(ret) as _ + move |a: &dyn Array, b: &dyn Array| -> ArrowResult { + let array = op_bool_arr(a, b)?; + Ok(Arc::new(array)) } } @@ -714,19 +671,20 @@ fn get_arrow_op(op: PyComparisonOp) -> impl Fn(&dyn Array, &dyn Array) -> Box impl Fn(&dyn Array, &dyn Scalar, &VirtualMachine) -> PyResult> { +) -> impl Fn(&dyn Array, &dyn Array, &VirtualMachine) -> PyResult { let op_bool_arr = match op { - PyComparisonOp::Eq => comparison::eq_scalar, - PyComparisonOp::Ne => comparison::neq_scalar, - PyComparisonOp::Gt => comparison::gt_scalar, - PyComparisonOp::Lt => comparison::lt_scalar, - PyComparisonOp::Ge => comparison::gt_eq_scalar, - PyComparisonOp::Le => comparison::lt_eq_scalar, + PyComparisonOp::Eq => comparison::eq_dyn, + PyComparisonOp::Ne => comparison::neq_dyn, + PyComparisonOp::Gt => comparison::gt_dyn, + PyComparisonOp::Lt => comparison::lt_dyn, + PyComparisonOp::Ge => comparison::gt_eq_dyn, + PyComparisonOp::Le => comparison::lt_eq_dyn, }; - move |a: &dyn Array, b: &dyn Scalar, _vm| -> PyResult> { - let ret = op_bool_arr(a, b); - Ok(Box::new(ret) as _) + move |a: &dyn Array, b: &dyn Array, vm| -> PyResult { + let array = + op_bool_arr(a, b).map_err(|e| 
vm.new_type_error(format!("scalar op error: {}", e)))?; + Ok(Arc::new(array)) } } @@ -875,7 +833,7 @@ pub fn pyobj_try_to_typed_val( // FIXME(dennis): we always consider the timestamp unit is millis, it's not correct if user define timestamp column with other units. obj.try_into_value::(vm) .ok() - .map(Timestamp::from_millis) + .map(Timestamp::new_millisecond) .map(value::Value::Timestamp) } _ => unreachable!(), diff --git a/src/script/src/table.rs b/src/script/src/table.rs index abc0279a3f..7c1570d8d1 100644 --- a/src/script/src/table.rs +++ b/src/script/src/table.rs @@ -21,12 +21,10 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, SCRIPTS_ use common_query::Output; use common_recordbatch::util as record_util; use common_telemetry::logging; -use common_time::timestamp::Timestamp; use common_time::util; -use datatypes::arrow::array::Utf8Array; use datatypes::prelude::{ConcreteDataType, ScalarVector}; use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder}; -use datatypes::vectors::{StringVector, TimestampVector, VectorRef}; +use datatypes::vectors::{StringVector, TimestampMillisecondVector, Vector, VectorRef}; use query::QueryEngineRef; use session::context::QueryContext; use snafu::{ensure, OptionExt, ResultExt}; @@ -104,19 +102,16 @@ impl ScriptsTable { // Timestamp in key part is intentionally left to 0 columns_values.insert( "timestamp".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis(0)])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[0])) as _, ); + let now = util::current_time_millis(); columns_values.insert( "gmt_created".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); columns_values.insert( "gmt_modified".to_string(), - Arc::new(TimestampVector::from_slice(&[Timestamp::from_millis( - util::current_time_millis(), - )])) as _, + Arc::new(TimestampMillisecondVector::from_slice(&[now])) as _, ); let table = self @@ -171,23 +166,21 @@ impl ScriptsTable { ensure!(!records.is_empty(), ScriptNotFoundSnafu { name }); assert_eq!(records.len(), 1); - assert_eq!(records[0].df_recordbatch.num_columns(), 1); + assert_eq!(records[0].num_columns(), 1); - let record = &records[0].df_recordbatch; - - let script_column = record - .column(0) + let script_column = records[0].column(0); + let script_column = script_column .as_any() - .downcast_ref::>() - .context(CastTypeSnafu { + .downcast_ref::() + .with_context(|| CastTypeSnafu { msg: format!( - "can't downcast {:?} array into utf8 array", - record.column(0).data_type() + "can't downcast {:?} array into string vector", + script_column.data_type() ), })?; assert_eq!(script_column.len(), 1); - Ok(script_column.value(0).to_string()) + Ok(script_column.get_data(0).unwrap().to_string()) } #[inline] @@ -216,18 +209,18 @@ fn build_scripts_schema() -> Schema { ), ColumnSchema::new( "timestamp".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), ColumnSchema::new( "gmt_created".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ColumnSchema::new( "gmt_modified".to_string(), - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), false, ), ]; diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 04158cdbd3..3885543e98 
100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -277,7 +277,7 @@ impl JsonResponse { } } -async fn serve_api(Extension(api): Extension>) -> impl IntoApiResponse { +async fn serve_api(Extension(api): Extension) -> impl IntoApiResponse { Json(api) } diff --git a/src/servers/src/http/influxdb.rs b/src/servers/src/http/influxdb.rs index 06929eb341..b68cb3616f 100644 --- a/src/servers/src/http/influxdb.rs +++ b/src/servers/src/http/influxdb.rs @@ -48,12 +48,12 @@ pub async fn influxdb_write( fn parse_time_precision(value: &str) -> Result { match value { - "n" => Ok(Precision::NANOSECOND), - "u" => Ok(Precision::MICROSECOND), - "ms" => Ok(Precision::MILLISECOND), - "s" => Ok(Precision::SECOND), - "m" => Ok(Precision::MINUTE), - "h" => Ok(Precision::HOUR), + "n" => Ok(Precision::Nanosecond), + "u" => Ok(Precision::Microsecond), + "ms" => Ok(Precision::Millisecond), + "s" => Ok(Precision::Second), + "m" => Ok(Precision::Minute), + "h" => Ok(Precision::Hour), unknown => TimePrecisionSnafu { name: unknown.to_string(), } @@ -69,12 +69,12 @@ mod tests { #[test] fn test_parse_time_precision() { - assert_eq!(Precision::NANOSECOND, parse_time_precision("n").unwrap()); - assert_eq!(Precision::MICROSECOND, parse_time_precision("u").unwrap()); - assert_eq!(Precision::MILLISECOND, parse_time_precision("ms").unwrap()); - assert_eq!(Precision::SECOND, parse_time_precision("s").unwrap()); - assert_eq!(Precision::MINUTE, parse_time_precision("m").unwrap()); - assert_eq!(Precision::HOUR, parse_time_precision("h").unwrap()); + assert_eq!(Precision::Nanosecond, parse_time_precision("n").unwrap()); + assert_eq!(Precision::Microsecond, parse_time_precision("u").unwrap()); + assert_eq!(Precision::Millisecond, parse_time_precision("ms").unwrap()); + assert_eq!(Precision::Second, parse_time_precision("s").unwrap()); + assert_eq!(Precision::Minute, parse_time_precision("m").unwrap()); + assert_eq!(Precision::Hour, parse_time_precision("h").unwrap()); assert!(parse_time_precision("unknown").is_err()); } } diff --git a/src/servers/src/influxdb.rs b/src/servers/src/influxdb.rs index 0766d65843..870f6918b8 100644 --- a/src/servers/src/influxdb.rs +++ b/src/servers/src/influxdb.rs @@ -24,7 +24,7 @@ use crate::error::{Error, InfluxdbLineProtocolSnafu, InfluxdbLinesWriteSnafu}; use crate::line_writer::LineWriter; pub const INFLUXDB_TIMESTAMP_COLUMN_NAME: &str = "ts"; -pub const DEFAULT_TIME_PRECISION: Precision = Precision::NANOSECOND; +pub const DEFAULT_TIME_PRECISION: Precision = Precision::Nanosecond; pub struct InfluxdbRequest { pub precision: Option, @@ -359,11 +359,11 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; verify_column( &columns[3], "ts", - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, Vec::new(), Values { - ts_millis_values: vec![1663840496100, 1663840496400], + ts_millisecond_values: vec![1663840496100, 1663840496400], ..Default::default() }, ); @@ -398,11 +398,11 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; verify_column( &columns[2], "ts", - ColumnDataType::Timestamp, + ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, Vec::new(), Values { - ts_millis_values: vec![1663840496100, 1663840496400], + ts_millisecond_values: vec![1663840496100, 1663840496400], ..Default::default() }, ); diff --git a/src/servers/src/line_writer.rs b/src/servers/src/line_writer.rs index cbb2aff987..211e720399 100644 --- a/src/servers/src/line_writer.rs +++ b/src/servers/src/line_writer.rs @@ -18,12 +18,16 @@ use 
common_catalog::consts::DEFAULT_CATALOG_NAME; use common_grpc::writer::{to_ms_ts, Precision}; use common_time::timestamp::TimeUnit::Millisecond; use common_time::Timestamp; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; -use datatypes::types::TimestampType; -use datatypes::value::Value; -use datatypes::vectors::{VectorBuilder, VectorRef}; +use datatypes::types::{TimestampMillisecondType, TimestampType}; +use datatypes::value::{Value, ValueRef}; +use datatypes::vectors::{MutableVector, VectorRef}; +use snafu::ResultExt; use table::requests::InsertRequest; +use crate::error::VectorConversionSnafu; + type ColumnLen = usize; type ColumnName = String; @@ -32,7 +36,7 @@ pub struct LineWriter { table_name: String, expected_rows: usize, current_rows: usize, - columns_builders: HashMap, + columns_builders: HashMap, ColumnLen)>, } impl LineWriter { @@ -48,7 +52,8 @@ impl LineWriter { pub fn write_ts(&mut self, column_name: &str, value: (i64, Precision)) { let (val, precision) = value; - let datatype = ConcreteDataType::Timestamp(TimestampType { unit: Millisecond }); + let datatype = + ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)); let ts_val = Value::Timestamp(Timestamp::new(to_ms_ts(precision, val), Millisecond)); self.write(column_name, datatype, ts_val); } @@ -104,8 +109,12 @@ impl LineWriter { fn write(&mut self, column_name: &str, datatype: ConcreteDataType, value: Value) { let or_insert = || { let rows = self.current_rows; - let mut builder = VectorBuilder::with_capacity(datatype, self.expected_rows); - (0..rows).into_iter().for_each(|_| builder.push_null()); + let mut builder = datatype.create_mutable_vector(self.expected_rows); + (0..rows) + .into_iter() + .try_for_each(|_| builder.push_value_ref(ValueRef::Null)) + .context(VectorConversionSnafu) + .unwrap(); (builder, rows) }; let (builder, column_len) = self @@ -113,7 +122,7 @@ impl LineWriter { .entry(column_name.to_string()) .or_insert_with(or_insert); - builder.push(&value); + builder.push_value_ref(value.as_value_ref()).unwrap(); *column_len += 1; } @@ -122,18 +131,22 @@ impl LineWriter { self.columns_builders .values_mut() .into_iter() - .for_each(|(builder, len)| { + .try_for_each(|(builder, len)| { if self.current_rows > *len { - builder.push(&Value::Null) + builder.push_value_ref(ValueRef::Null) + } else { + Ok(()) } - }); + }) + .context(VectorConversionSnafu) + .unwrap(); } pub fn finish(self) -> InsertRequest { let columns_values: HashMap = self .columns_builders .into_iter() - .map(|(column_name, (mut builder, _))| (column_name, builder.finish())) + .map(|(column_name, (mut builder, _))| (column_name, builder.to_vector())) .collect(); InsertRequest { catalog_name: DEFAULT_CATALOG_NAME.to_string(), @@ -158,18 +171,18 @@ mod tests { #[test] fn test_writer() { let mut writer = LineWriter::with_lines(DEFAULT_SCHEMA_NAME, "demo".to_string(), 4); - writer.write_ts("ts", (1665893727685, Precision::MILLISECOND)); + writer.write_ts("ts", (1665893727685, Precision::Millisecond)); writer.write_tag("host", "host-1"); writer.write_i64("memory", 10_i64); writer.commit(); - writer.write_ts("ts", (1665893727686, Precision::MILLISECOND)); + writer.write_ts("ts", (1665893727686, Precision::Millisecond)); writer.write_tag("host", "host-2"); writer.write_tag("region", "region-2"); writer.write_i64("memory", 9_i64); writer.commit(); - writer.write_ts("ts", (1665893727689, Precision::MILLISECOND)); + writer.write_ts("ts", (1665893727689, Precision::Millisecond)); 
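        // Editor's sketch (not part of the patch): LineWriter now fills columns
        // through the MutableVector API instead of the removed VectorBuilder.
        // Assuming the `datatypes` crate as imported above (the `DataType` trait
        // provides `create_mutable_vector`), a column is built roughly like:
        //     let mut b = ConcreteDataType::int64_datatype().create_mutable_vector(2);
        //     b.push_value_ref(ValueRef::Int64(10)).unwrap();
        //     b.push_value_ref(ValueRef::Null).unwrap();
        //     let col: VectorRef = b.to_vector(); // [10, NULL]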
writer.write_tag("host", "host-3"); writer.write_tag("region", "region-3"); writer.write_i64("cpu", 19_i64); @@ -195,9 +208,9 @@ mod tests { let cpu = columns.get("cpu").unwrap(); let expected: Vec = vec![ - Value::Timestamp(Timestamp::from_millis(1665893727685_i64)), - Value::Timestamp(Timestamp::from_millis(1665893727686_i64)), - Value::Timestamp(Timestamp::from_millis(1665893727689_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727685_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727686_i64)), + Value::Timestamp(Timestamp::new_millisecond(1665893727689_i64)), ]; assert_vector(&expected, ts); diff --git a/src/servers/src/mysql/federated.rs b/src/servers/src/mysql/federated.rs index f2f1a8caed..1736ae67fe 100644 --- a/src/servers/src/mysql/federated.rs +++ b/src/servers/src/mysql/federated.rs @@ -310,90 +310,85 @@ mod test { let output = check(query, Arc::new(QueryContext::new())); assert!(output.is_none()); - fn test(query: &str, expected: Vec<&str>) { + fn test(query: &str, expected: &str) { let output = check(query, Arc::new(QueryContext::new())); match output.unwrap() { Output::RecordBatches(r) => { - assert_eq!(r.pretty_print().lines().collect::>(), expected) + assert_eq!(&r.pretty_print().unwrap(), expected) } _ => unreachable!(), } } let query = "select version()"; - let expected = vec![ - "+-----------+", - "| version() |", - "+-----------+", - "| 8.0.26 |", - "+-----------+", - ]; + let expected = "\ ++-----------+ +| version() | ++-----------+ +| 8.0.26 | ++-----------+"; test(query, expected); let query = "SELECT @@version_comment LIMIT 1"; - let expected = vec![ - "+-------------------+", - "| @@version_comment |", - "+-------------------+", - "| Greptime |", - "+-------------------+", - ]; + let expected = "\ ++-------------------+ +| @@version_comment | ++-------------------+ +| Greptime | ++-------------------+"; test(query, expected); // variables let query = "select @@tx_isolation, @@session.tx_isolation"; - let expected = vec![ - "+-----------------+------------------------+", - "| @@tx_isolation | @@session.tx_isolation |", - "+-----------------+------------------------+", - "| REPEATABLE-READ | REPEATABLE-READ |", - "+-----------------+------------------------+", - ]; + let expected = "\ ++-----------------+------------------------+ +| @@tx_isolation | @@session.tx_isolation | ++-----------------+------------------------+ +| REPEATABLE-READ | REPEATABLE-READ | ++-----------------+------------------------+"; test(query, expected); // complex variables let query = "/* mysql-connector-java-8.0.17 (Revision: 16a712ddb3f826a1933ab42b0039f7fb9eebc6ec) */SELECT @@session.auto_increment_increment AS auto_increment_increment, @@character_set_client AS character_set_client, @@character_set_connection AS character_set_connection, @@character_set_results AS character_set_results, @@character_set_server AS character_set_server, @@collation_server AS collation_server, @@collation_connection AS collation_connection, @@init_connect AS init_connect, @@interactive_timeout AS interactive_timeout, @@license AS license, @@lower_case_table_names AS lower_case_table_names, @@max_allowed_packet AS max_allowed_packet, @@net_write_timeout AS net_write_timeout, @@performance_schema AS performance_schema, @@sql_mode AS sql_mode, @@system_time_zone AS system_time_zone, @@time_zone AS time_zone, @@transaction_isolation AS transaction_isolation, @@wait_timeout AS wait_timeout;"; - let expected = vec![ - 
"+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - "| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout; |", - "+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - "| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | UTC | UTC | REPEATABLE-READ | 31536000 |", - "+--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+", - ]; + let expected = "\ ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+ +| auto_increment_increment | character_set_client | character_set_connection | character_set_results | character_set_server | collation_server | collation_connection | init_connect | interactive_timeout | license | lower_case_table_names | max_allowed_packet | net_write_timeout | performance_schema | sql_mode | system_time_zone | time_zone | transaction_isolation | wait_timeout; | ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+ +| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31536000 | 0 | 0 | 134217728 | 31536000 | 0 | 0 | UTC | UTC | REPEATABLE-READ | 31536000 | ++--------------------------+----------------------+--------------------------+-----------------------+----------------------+------------------+----------------------+--------------+---------------------+---------+------------------------+--------------------+-------------------+--------------------+----------+------------------+-----------+-----------------------+---------------+"; test(query, expected); let query = "show variables"; - let expected = vec![ - "+---------------+-------+", - "| Variable_name | Value |", - "+---------------+-------+", - "| | 
|", - "+---------------+-------+", - ]; + let expected = "\ ++---------------+-------+ +| Variable_name | Value | ++---------------+-------+ +| | | ++---------------+-------+"; test(query, expected); let query = "show variables like 'lower_case_table_names'"; - let expected = vec![ - "+------------------------+-------+", - "| Variable_name | Value |", - "+------------------------+-------+", - "| lower_case_table_names | 0 |", - "+------------------------+-------+", - ]; + let expected = "\ ++------------------------+-------+ +| Variable_name | Value | ++------------------------+-------+ +| lower_case_table_names | 0 | ++------------------------+-------+"; test(query, expected); let query = "show collation"; - let expected = vec!["++", "++"]; // empty + let expected = "\ +++ +++"; // empty test(query, expected); let query = "SELECT TIMEDIFF(NOW(), UTC_TIMESTAMP())"; - let expected = vec![ - "+----------------------------------+", - "| TIMEDIFF(NOW(), UTC_TIMESTAMP()) |", - "+----------------------------------+", - "| 00:00:00 |", - "+----------------------------------+", - ]; + let expected = "\ ++----------------------------------+ +| TIMEDIFF(NOW(), UTC_TIMESTAMP()) | ++----------------------------------+ +| 00:00:00 | ++----------------------------------+"; test(query, expected); } } diff --git a/src/servers/src/mysql/server.rs b/src/servers/src/mysql/server.rs index 79a3bd3a66..3bec0ebbbc 100644 --- a/src/servers/src/mysql/server.rs +++ b/src/servers/src/mysql/server.rs @@ -41,7 +41,7 @@ const DEFAULT_RESULT_SET_WRITE_BUFFER_SIZE: usize = 100 * 1024; pub struct MysqlServer { base_server: BaseTcpServer, query_handler: SqlQueryHandlerRef, - tls: Arc, + tls: TlsOption, user_provider: Option, } @@ -49,7 +49,7 @@ impl MysqlServer { pub fn create_server( query_handler: SqlQueryHandlerRef, io_runtime: Arc, - tls: Arc, + tls: TlsOption, user_provider: Option, ) -> Box { Box::new(MysqlServer { diff --git a/src/servers/src/opentsdb/codec.rs b/src/servers/src/opentsdb/codec.rs index 260a206fe5..49fccc4848 100644 --- a/src/servers/src/opentsdb/codec.rs +++ b/src/servers/src/opentsdb/codec.rs @@ -132,7 +132,7 @@ impl DataPoint { let mut line_writer = LineWriter::with_lines(DEFAULT_SCHEMA_NAME, self.metric.clone(), 1); line_writer.write_ts( OPENTSDB_TIMESTAMP_COLUMN_NAME, - (self.ts_millis(), Precision::MILLISECOND), + (self.ts_millis(), Precision::Millisecond), ); line_writer.write_f64(OPENTSDB_VALUE_COLUMN_NAME, self.value); @@ -152,11 +152,11 @@ impl DataPoint { let ts_column = Column { column_name: OPENTSDB_TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: vec![self.ts_millis], + ts_millisecond_values: vec![self.ts_millis], ..Default::default() }), semantic_type: SemanticType::Timestamp as i32, - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, ..Default::default() }; columns.push(ts_column); @@ -336,7 +336,7 @@ mod test { assert_eq!(columns[0].column_name, OPENTSDB_TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000] ); diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 36dbd80d33..3d9b11c077 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -237,7 +237,7 @@ mod test { ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true), ColumnSchema::new( "timestamps", - 
ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), true, ), ColumnSchema::new("dates", ConcreteDataType::date_datatype(), true), diff --git a/src/servers/src/postgres/server.rs b/src/servers/src/postgres/server.rs index d2b8844992..5003af92c5 100644 --- a/src/servers/src/postgres/server.rs +++ b/src/servers/src/postgres/server.rs @@ -37,14 +37,14 @@ pub struct PostgresServer { base_server: BaseTcpServer, auth_handler: Arc, query_handler: Arc, - tls: Arc, + tls: TlsOption, } impl PostgresServer { /// Creates a new Postgres server with provided query_handler and async runtime pub fn new( query_handler: SqlQueryHandlerRef, - tls: Arc, + tls: TlsOption, io_runtime: Arc, user_provider: Option, ) -> PostgresServer { diff --git a/src/servers/src/prometheus.rs b/src/servers/src/prometheus.rs index 1c2b035ec0..80d9db0b74 100644 --- a/src/servers/src/prometheus.rs +++ b/src/servers/src/prometheus.rs @@ -22,7 +22,7 @@ use api::prometheus::remote::{Label, Query, Sample, TimeSeries, WriteRequest}; use api::v1::codec::SelectResult; use api::v1::column::SemanticType; use api::v1::{column, Column, ColumnDataType, InsertExpr}; -use common_grpc::writer::Precision::MILLISECOND; +use common_grpc::writer::Precision::Millisecond; use openmetrics_parser::{MetricsExposition, PrometheusType, PrometheusValue}; use snafu::{OptionExt, ResultExt}; use snap::raw::{Decoder, Encoder}; @@ -279,7 +279,7 @@ pub fn select_result_to_timeseries( timestamp: ts_column .values .as_ref() - .map(|vs| vs.ts_millis_values[ts_row]) + .map(|vs| vs.ts_millisecond_values[ts_row]) .unwrap_or(0i64), }; @@ -325,7 +325,7 @@ fn timeseries_to_insert_request(db: &str, mut timeseries: TimeSeries) -> Result< let ts_millis = sample.timestamp; let val = sample.value; - line_writer.write_ts(TIMESTAMP_COLUMN_NAME, (ts_millis, MILLISECOND)); + line_writer.write_ts(TIMESTAMP_COLUMN_NAME, (ts_millis, Millisecond)); line_writer.write_f64(VALUE_COLUMN_NAME, val); labels @@ -368,11 +368,11 @@ fn timeseries_to_insert_expr(database: &str, mut timeseries: TimeSeries) -> Resu let ts_column = Column { column_name: TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: samples.iter().map(|x| x.timestamp).collect(), + ts_millisecond_values: samples.iter().map(|x| x.timestamp).collect(), ..Default::default() }), semantic_type: SemanticType::Timestamp as i32, - datatype: ColumnDataType::Timestamp as i32, + datatype: ColumnDataType::TimestampMillisecond as i32, ..Default::default() }; columns.push(ts_column); @@ -686,7 +686,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000] ); @@ -712,7 +712,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000] ); @@ -743,7 +743,7 @@ mod tests { assert_eq!(columns[0].column_name, TIMESTAMP_COLUMN_NAME); assert_eq!( - columns[0].values.as_ref().unwrap().ts_millis_values, + columns[0].values.as_ref().unwrap().ts_millisecond_values, vec![1000, 2000, 3000] ); @@ -773,7 +773,7 @@ mod tests { Column { column_name: TIMESTAMP_COLUMN_NAME.to_string(), values: Some(column::Values { - ts_millis_values: vec![1000, 2000], + ts_millisecond_values: vec![1000, 2000], ..Default::default() }), ..Default::default() diff --git 
a/src/servers/tests/mysql/mysql_server_test.rs b/src/servers/tests/mysql/mysql_server_test.rs index fc0ef36f2a..2d1aac91a9 100644 --- a/src/servers/tests/mysql/mysql_server_test.rs +++ b/src/servers/tests/mysql/mysql_server_test.rs @@ -33,7 +33,7 @@ use table::test_util::MemTable; use crate::create_testing_sql_query_handler; use crate::mysql::{all_datatype_testing_data, MysqlTextRow, TestingData}; -fn create_mysql_server(table: MemTable, tls: Arc) -> Result> { +fn create_mysql_server(table: MemTable, tls: TlsOption) -> Result> { let query_handler = create_testing_sql_query_handler(table); let io_runtime = Arc::new( RuntimeBuilder::default() @@ -125,7 +125,7 @@ async fn test_shutdown_mysql_server() -> Result<()> { async fn test_query_all_datatypes() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption::default()); + let server_tls = TlsOption::default(); let client_tls = false; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -134,11 +134,11 @@ async fn test_query_all_datatypes() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_prefer_secure_client_plain() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -147,11 +147,11 @@ async fn test_server_prefer_secure_client_plain() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_prefer_secure_client_secure() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -160,11 +160,11 @@ async fn test_server_prefer_secure_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn test_server_require_secure_client_secure() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_test_query_all_datatypes(server_tls, client_tls).await?; @@ -173,11 +173,11 @@ async fn test_server_require_secure_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_server_required_secure_client_plain() -> Result<()> { - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; @@ -202,7 +202,7 @@ async fn test_server_required_secure_client_plain() -> Result<()> { Ok(()) } -async fn do_test_query_all_datatypes(server_tls: Arc, client_tls: bool) -> Result<()> { +async fn do_test_query_all_datatypes(server_tls: TlsOption, client_tls: bool) -> Result<()> { common_telemetry::init_default_ut_logging(); let TestingData { column_schemas, diff --git a/src/servers/tests/postgres/mod.rs b/src/servers/tests/postgres/mod.rs index f7cdec12b2..5653251c0d 100644 --- a/src/servers/tests/postgres/mod.rs +++ b/src/servers/tests/postgres/mod.rs @@ -36,7 +36,7 @@ use 
crate::create_testing_sql_query_handler; fn create_postgres_server( table: MemTable, check_pwd: bool, - tls: Arc, + tls: TlsOption, ) -> Result> { let query_handler = create_testing_sql_query_handler(table); let io_runtime = Arc::new( @@ -194,11 +194,11 @@ async fn test_query_pg_concurrently() -> Result<()> { async fn test_server_secure_prefer_client_plain() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Prefer, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = false; do_simple_query(server_tls, client_tls).await?; @@ -209,11 +209,11 @@ async fn test_server_secure_prefer_client_plain() -> Result<()> { async fn test_server_secure_require_client_plain() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let server_port = start_test_server(server_tls).await?; let r = create_plain_connection(server_port, false).await; assert!(r.is_err()); @@ -224,11 +224,11 @@ async fn test_server_secure_require_client_plain() -> Result<()> { async fn test_server_secure_require_client_secure() -> Result<()> { common_telemetry::init_default_ut_logging(); - let server_tls = Arc::new(TlsOption { + let server_tls = TlsOption { mode: servers::tls::TlsMode::Require, cert_path: "tests/ssl/server.crt".to_owned(), key_path: "tests/ssl/server.key".to_owned(), - }); + }; let client_tls = true; do_simple_query(server_tls, client_tls).await?; @@ -237,7 +237,7 @@ async fn test_server_secure_require_client_secure() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_using_db() -> Result<()> { - let server_port = start_test_server(Arc::new(TlsOption::default())).await?; + let server_port = start_test_server(TlsOption::default()).await?; let client = create_connection_with_given_db(server_port, "testdb") .await @@ -253,7 +253,7 @@ async fn test_using_db() -> Result<()> { Ok(()) } -async fn start_test_server(server_tls: Arc) -> Result { +async fn start_test_server(server_tls: TlsOption) -> Result { common_telemetry::init_default_ut_logging(); let table = MemTable::default_numbers_table(); let pg_server = create_postgres_server(table, false, server_tls)?; @@ -262,7 +262,7 @@ async fn start_test_server(server_tls: Arc) -> Result { Ok(server_addr.port()) } -async fn do_simple_query(server_tls: Arc, client_tls: bool) -> Result<()> { +async fn do_simple_query(server_tls: TlsOption, client_tls: bool) -> Result<()> { let server_port = start_test_server(server_tls).await?; if !client_tls { diff --git a/src/sql/Cargo.toml b/src/sql/Cargo.toml index 6f7f40b017..ebdd0f172b 100644 --- a/src/sql/Cargo.toml +++ b/src/sql/Cargo.toml @@ -15,4 +15,4 @@ itertools = "0.10" mito = { path = "../mito" } once_cell = "1.10" snafu = { version = "0.7", features = ["backtraces"] } -sqlparser = "0.15.0" +sqlparser = "0.26" diff --git a/src/sql/src/ast.rs b/src/sql/src/ast.rs index 11636df8c0..7388b9453c 100644 --- a/src/sql/src/ast.rs +++ b/src/sql/src/ast.rs @@ -14,5 +14,5 @@ pub use sqlparser::ast::{ ColumnDef, ColumnOption, ColumnOptionDef, DataType, Expr, Function, FunctionArg, - FunctionArgExpr, Ident, ObjectName, SqlOption, TableConstraint, Value, + FunctionArgExpr, Ident, ObjectName, SqlOption, TableConstraint, 
TimezoneInfo, Value, }; diff --git a/src/sql/src/parser.rs b/src/sql/src/parser.rs index 254982e88e..3a14fb0666 100644 --- a/src/sql/src/parser.rs +++ b/src/sql/src/parser.rs @@ -505,11 +505,7 @@ mod tests { assert_matches!( &stmts[0], Statement::ShowTables(ShowTables { - kind: ShowKind::Where(sqlparser::ast::Expr::BinaryOp { - left: _, - right: _, - op: sqlparser::ast::BinaryOperator::Like, - }), + kind: ShowKind::Where(sqlparser::ast::Expr::Like { .. }), database: None, }) ); @@ -522,11 +518,7 @@ mod tests { assert_matches!( &stmts[0], Statement::ShowTables(ShowTables { - kind: ShowKind::Where(sqlparser::ast::Expr::BinaryOp { - left: _, - right: _, - op: sqlparser::ast::BinaryOperator::Like, - }), + kind: ShowKind::Where(sqlparser::ast::Expr::Like { .. }), database: Some(_), }) ); @@ -543,11 +535,12 @@ mod tests { distinct: false, top: None, projection: vec![sqlparser::ast::SelectItem::Wildcard], + into: None, from: vec![sqlparser::ast::TableWithJoins { relation: sqlparser::ast::TableFactor::Table { name: sqlparser::ast::ObjectName(vec![sqlparser::ast::Ident::new("foo")]), alias: None, - args: vec![], + args: None, with_hints: vec![], }, joins: vec![], @@ -559,11 +552,12 @@ mod tests { distribute_by: vec![], sort_by: vec![], having: None, + qualify: None, }; let sp_statement = SpStatement::Query(Box::new(SpQuery { with: None, - body: sqlparser::ast::SetExpr::Select(Box::new(select)), + body: Box::new(sqlparser::ast::SetExpr::Select(Box::new(select))), order_by: vec![], limit: None, offset: None, @@ -576,6 +570,7 @@ mod tests { analyze: false, verbose: false, statement: Box::new(sp_statement), + format: None, }) .unwrap(); diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs index 0ddb8e6ff5..12dc8fa58e 100644 --- a/src/sql/src/parsers/create_parser.rs +++ b/src/sql/src/parsers/create_parser.rs @@ -253,7 +253,7 @@ impl<'a> ParserContext<'a> { .parse_column_def() .context(SyntaxSnafu { sql: self.sql })?; - if !matches!(column.data_type, DataType::Timestamp) + if !matches!(column.data_type, DataType::Timestamp(_)) || matches!(self.parser.peek_token(), Token::Comma) { columns.push(column); diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index bcdc099265..ba8397ca01 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -21,12 +21,12 @@ pub mod insert; pub mod query; pub mod show; pub mod statement; - use std::str::FromStr; use api::helper::ColumnDataTypeWrapper; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_time::Timestamp; +use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema}; use datatypes::types::DateTimeType; @@ -79,7 +79,7 @@ fn parse_string_to_value( data_type: &ConcreteDataType, ) -> Result { ensure!( - data_type.stringifiable(), + data_type.is_stringifiable(), ColumnTypeMismatchSnafu { column_name, expect: data_type.clone(), @@ -112,8 +112,8 @@ fn parse_string_to_value( ConcreteDataType::Timestamp(t) => { if let Ok(ts) = Timestamp::from_str(&s) { Ok(Value::Timestamp(Timestamp::new( - ts.convert_to(t.unit), - t.unit, + ts.convert_to(t.unit()), + t.unit(), ))) } else { ParseSqlValueSnafu { @@ -301,7 +301,10 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result Ok(ConcreteDataType::date_datatype()), SqlDataType::Custom(obj_name) => match &obj_name.0[..] 
{ [type_name] => { - if type_name.value.eq_ignore_ascii_case(DateTimeType::name()) { + if type_name + .value + .eq_ignore_ascii_case(DateTimeType::default().name()) + { Ok(ConcreteDataType::datetime_datatype()) } else { error::SqlTypeNotSupportedSnafu { @@ -315,7 +318,7 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result Ok(ConcreteDataType::timestamp_millis_datatype()), + SqlDataType::Timestamp(_) => Ok(ConcreteDataType::timestamp_millisecond_datatype()), _ => error::SqlTypeNotSupportedSnafu { t: data_type.clone(), } @@ -333,7 +336,7 @@ mod tests { use datatypes::value::OrderedFloat; use super::*; - use crate::ast::{DataType, Ident}; + use crate::ast::{Ident, TimezoneInfo}; use crate::statements::ColumnOption; fn check_type(sql_type: SqlDataType, data_type: ConcreteDataType) { @@ -373,8 +376,8 @@ mod tests { ConcreteDataType::datetime_datatype(), ); check_type( - SqlDataType::Timestamp, - ConcreteDataType::timestamp_millis_datatype(), + SqlDataType::Timestamp(TimezoneInfo::None), + ConcreteDataType::timestamp_millisecond_datatype(), ); } @@ -419,9 +422,13 @@ mod tests { let sql_val = SqlValue::Boolean(true); let v = sql_value_to_value("a", &ConcreteDataType::float64_datatype(), &sql_val); assert!(v.is_err()); - assert!(format!("{:?}", v).contains( - "column_name: \"a\", expect: Float64(Float64), actual: Boolean(BooleanType)" - )); + assert!( + format!("{:?}", v).contains( + "column_name: \"a\", expect: Float64(Float64Type), actual: Boolean(BooleanType)" + ), + "v is {:?}", + v + ); } #[test] @@ -471,7 +478,7 @@ mod tests { match parse_string_to_value( "timestamp_col", "2022-02-22T00:01:01+08:00".to_string(), - &ConcreteDataType::timestamp_millis_datatype(), + &ConcreteDataType::timestamp_millisecond_datatype(), ) .unwrap() { @@ -570,7 +577,7 @@ mod tests { // test basic let column_def = ColumnDef { name: "col".into(), - data_type: DataType::Double, + data_type: SqlDataType::Double, collation: None, options: vec![], }; @@ -585,7 +592,7 @@ mod tests { // test not null let column_def = ColumnDef { name: "col".into(), - data_type: DataType::Double, + data_type: SqlDataType::Double, collation: None, options: vec![ColumnOptionDef { name: None, diff --git a/src/sql/src/statements/insert.rs b/src/sql/src/statements/insert.rs index 410c0d09cb..f105648ea8 100644 --- a/src/sql/src/statements/insert.rs +++ b/src/sql/src/statements/insert.rs @@ -49,7 +49,7 @@ impl Insert { pub fn values(&self) -> Result>> { let values = match &self.inner { - Statement::Insert { source, .. } => match &source.body { + Statement::Insert { source, .. 
} => match &*source.body { SetExpr::Values(Values(exprs)) => sql_exprs_to_values(exprs)?, _ => unreachable!(), }, diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 94d106f699..9c107d3e64 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -6,7 +6,7 @@ license = "Apache-2.0" [dependencies] arc-swap = "1.0" -arrow-format = { version = "0.4", features = ["ipc"] } +async-compat = "0.2" async-stream = "0.3" async-trait = "0.1" bytes = "1.1" @@ -22,6 +22,7 @@ futures-util = "0.3" lazy_static = "1.4" object-store = { path = "../object-store" } paste = "1.0" +parquet = { version = "26", features = ["async"] } planus = "0.2" prost = "0.11" regex = "1.5" diff --git a/src/storage/benches/memtable/mod.rs b/src/storage/benches/memtable/mod.rs index 462c3edc28..eb12b11ab0 100644 --- a/src/storage/benches/memtable/mod.rs +++ b/src/storage/benches/memtable/mod.rs @@ -20,9 +20,11 @@ pub mod util; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use common_time::Timestamp; use datatypes::prelude::ScalarVectorBuilder; -use datatypes::vectors::{StringVectorBuilder, TimestampVectorBuilder, UInt64VectorBuilder}; +use datatypes::timestamp::TimestampMillisecond; +use datatypes::vectors::{ + StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder, +}; use rand::distributions::Alphanumeric; use rand::prelude::ThreadRng; use rand::Rng; @@ -69,11 +71,11 @@ fn kvs_with_index( values: &[(Option, String)], ) -> KeyValues { let mut key_builders = ( - TimestampVectorBuilder::with_capacity(keys.len()), + TimestampMillisecondVectorBuilder::with_capacity(keys.len()), UInt64VectorBuilder::with_capacity(keys.len()), ); for key in keys { - key_builders.0.push(Some(Timestamp::from_millis(key.0))); + key_builders.0.push(Some(TimestampMillisecond::from(key.0))); key_builders.1.push(Some(key.1)); } let row_keys = vec![ diff --git a/src/storage/benches/memtable/util/regiondesc_util.rs b/src/storage/benches/memtable/util/regiondesc_util.rs index 51dcb8795a..e8f71c71bd 100644 --- a/src/storage/benches/memtable/util/regiondesc_util.rs +++ b/src/storage/benches/memtable/util/regiondesc_util.rs @@ -34,7 +34,7 @@ impl RegionDescBuilder { ColumnDescriptorBuilder::new( 1, TIMESTAMP_NAME, - ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::timestamp_millisecond_datatype(), ) .is_nullable(false) .build() diff --git a/src/storage/benches/wal/util/mod.rs b/src/storage/benches/wal/util/mod.rs index 47bd3766c0..477297074a 100644 --- a/src/storage/benches/wal/util/mod.rs +++ b/src/storage/benches/wal/util/mod.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use datatypes::prelude::ScalarVector; use datatypes::type_id::LogicalTypeId; use datatypes::vectors::{ - BooleanVector, Float64Vector, StringVector, TimestampVector, UInt64Vector, + BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, }; use rand::Rng; use storage::proto; @@ -31,7 +31,7 @@ pub fn new_test_batch() -> WriteBatch { &[ ("k1", LogicalTypeId::UInt64, false), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Boolean, true), ("4", LogicalTypeId::Float64, false), ("5", LogicalTypeId::Float64, false), @@ -71,7 +71,7 @@ pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec) { rng.fill(&mut fvs[..]); let intv = Arc::new(UInt64Vector::from_slice(&intvs)); let boolv = Arc::new(BooleanVector::from(boolvs.to_vec())); - let tsv = 
Arc::new(TimestampVector::from_values(tsvs)); + let tsv = Arc::new(TimestampMillisecondVector::from_values(tsvs)); let fvs = Arc::new(Float64Vector::from_slice(&fvs)); let svs = Arc::new(StringVector::from_slice(&svs)); let mut put_data = PutData::default(); diff --git a/src/storage/proto/write_batch.proto b/src/storage/proto/write_batch.proto index 6f0ec4a388..ed1813aa55 100644 --- a/src/storage/proto/write_batch.proto +++ b/src/storage/proto/write_batch.proto @@ -61,7 +61,12 @@ enum DataType { FLOAT64 = 11; STRING = 12; BINARY = 13; - TIMESTAMP = 14; + DATE = 14; + DATETIME = 15; + TIMESTAMP_SECOND = 16; + TIMESTAMP_MILLISECOND = 17; + TIMESTAMP_MICROSECOND = 18; + TIMESTAMP_NANOSECOND = 19; } message Values { @@ -81,5 +86,10 @@ message Values { repeated bool bool_values = 11; repeated bytes binary_values = 12; repeated string string_values = 13; - repeated int64 timestamp_values = 14; + repeated int32 date_values = 14; + repeated int64 datetime_values = 15; + repeated int64 ts_second_values = 16; + repeated int64 ts_millisecond_values = 17; + repeated int64 ts_microsecond_values = 18; + repeated int64 ts_nanosecond_values = 19; } diff --git a/src/storage/src/error.rs b/src/storage/src/error.rs index bc86199f23..53c34f8ecc 100644 --- a/src/storage/src/error.rs +++ b/src/storage/src/error.rs @@ -18,7 +18,6 @@ use std::str::Utf8Error; use common_error::prelude::*; use datatypes::arrow; -use datatypes::arrow::error::ArrowError; use datatypes::prelude::ConcreteDataType; use serde_json::error::Error as JsonError; use store_api::manifest::action::ProtocolVersion; @@ -54,10 +53,16 @@ pub enum Error { #[snafu(display("Failed to write parquet file, source: {}", source))] WriteParquet { - source: arrow::error::ArrowError, + source: parquet::errors::ParquetError, backtrace: Backtrace, }, + #[snafu(display("Failed to create RecordBatch from vectors, source: {}", source))] + NewRecordBatch { + backtrace: Backtrace, + source: arrow::error::ArrowError, + }, + #[snafu(display("Fail to read object from path: {}, source: {}", path, source))] ReadObject { path: String, @@ -180,7 +185,7 @@ pub enum Error { #[snafu(display("Failed to read Parquet file: {}, source: {}", file, source))] ReadParquet { file: String, - source: ArrowError, + source: parquet::errors::ParquetError, backtrace: Backtrace, }, @@ -396,7 +401,8 @@ impl ErrorExt for Error { | AlterMetadata { .. } | CompatRead { .. } | CreateDefaultToRead { .. } - | NoDefaultToRead { .. } => StatusCode::Unexpected, + | NoDefaultToRead { .. } + | NewRecordBatch { .. } => StatusCode::Unexpected, FlushIo { .. } | WriteParquet { .. } @@ -484,14 +490,14 @@ mod tests { #[test] pub fn test_arrow_error() { fn throw_arrow_error() -> std::result::Result<(), ArrowError> { - Err(ArrowError::ExternalFormat("Lorem ipsum".to_string())) + Err(ArrowError::IoError("Lorem ipsum".to_string())) } let error = throw_arrow_error() - .context(WriteParquetSnafu) + .context(NewRecordBatchSnafu) .err() .unwrap(); - assert_eq!(StorageUnavailable, error.status_code()); + assert_eq!(Unexpected, error.status_code()); assert!(error.backtrace_opt().is_some()); } } diff --git a/src/storage/src/manifest/action.rs b/src/storage/src/manifest/action.rs index 6c53aaf941..690fe679c4 100644 --- a/src/storage/src/manifest/action.rs +++ b/src/storage/src/manifest/action.rs @@ -30,7 +30,7 @@ use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber}; use crate::sst::FileMeta; /// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata). 
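An aside on the write_batch.proto change above: the single unit-less TIMESTAMP variant is gone, and each time unit now gets its own enum variant and its own repeated values field. The sketch below shows the round-trip a caller could rely on, using the `gen_columns`/`gen_put_data_vector` helpers this patch adds in src/storage/src/proto/write_batch.rs; the import path and sample values are illustrative, not part of the patch.

    use std::sync::Arc;

    use datatypes::data_type::ConcreteDataType;
    use datatypes::vectors::{TimestampMillisecondVector, VectorRef};
    // Hypothetical import path; the helpers live in src/storage/src/proto/write_batch.rs.
    use storage::proto::write_batch::{gen_columns, gen_put_data_vector};

    fn timestamp_column_roundtrip() {
        let ts: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 2000, 3000]));

        // Encoding now writes into `ts_millisecond_values` rather than the old
        // unit-less `timestamp_values` field.
        let column = gen_columns(&ts).unwrap();

        // Decoding dispatches on the time unit carried by the data type, so we
        // get a TimestampMillisecondVector back, not a bare i64 vector.
        let decoded =
            gen_put_data_vector(ConcreteDataType::timestamp_millisecond_datatype(), column)
                .unwrap();
        assert_eq!(&ts.to_arrow_array(), &decoded.to_arrow_array());
    }

Keeping one repeated field per unit makes the wire format self-describing, so readers no longer have to assume every timestamp is in milliseconds.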
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct RawRegionMetadata { pub id: RegionId, pub name: String, @@ -40,7 +40,7 @@ pub struct RawRegionMetadata { } /// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata). -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct RawColumnsMetadata { pub columns: Vec, pub row_key_end: usize, @@ -55,7 +55,7 @@ pub struct RawColumnFamiliesMetadata { pub column_families: Vec, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct RegionChange { /// The committed sequence of the region when this change happens. So the /// data with sequence **greater than** this sequence would use the new @@ -78,7 +78,7 @@ pub struct RegionEdit { pub files_to_remove: Vec, } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub enum RegionMetaAction { Protocol(ProtocolAction), Change(RegionChange), @@ -86,7 +86,7 @@ pub enum RegionMetaAction { Edit(RegionEdit), } -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] pub struct RegionMetaActionList { pub actions: Vec, pub prev_version: ManifestVersion, diff --git a/src/storage/src/memtable/btree.rs b/src/storage/src/memtable/btree.rs index a06c6ee5a7..e1da00a33d 100644 --- a/src/storage/src/memtable/btree.rs +++ b/src/storage/src/memtable/btree.rs @@ -18,11 +18,10 @@ use std::ops::Bound; use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; use std::sync::{Arc, RwLock}; +use datatypes::data_type::DataType; use datatypes::prelude::*; use datatypes::value::Value; -use datatypes::vectors::{ - UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, VectorBuilder, -}; +use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder}; use store_api::storage::{OpType, SequenceNumber}; use crate::error::Result; @@ -441,7 +440,7 @@ fn rows_to_vectors, T: RowsProvider>( let row_num = provider.row_num(); let mut builders = Vec::with_capacity(column_num); for data_type in data_types { - builders.push(VectorBuilder::with_capacity(data_type, row_num)); + builders.push(data_type.create_mutable_vector(row_num)); } let mut vectors = Vec::with_capacity(column_num); @@ -453,10 +452,13 @@ fn rows_to_vectors, T: RowsProvider>( for row_idx in 0..row_num { let row = provider.row_by_index(row_idx); let value = &row[col_idx]; - builder.push(value); + builder + .as_mut() + .push_value_ref(value.as_value_ref()) + .unwrap(); } - vectors.push(builder.finish()); + vectors.push(builder.to_vector()); } vectors diff --git a/src/storage/src/memtable/inserter.rs b/src/storage/src/memtable/inserter.rs index 6f0ea70b0f..a876f7c4c4 100644 --- a/src/storage/src/memtable/inserter.rs +++ b/src/storage/src/memtable/inserter.rs @@ -140,7 +140,7 @@ mod tests { use common_time::timestamp::Timestamp; use datatypes::type_id::LogicalTypeId; use datatypes::value::Value; - use datatypes::vectors::{Int64Vector, TimestampVector}; + use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use store_api::storage::{PutOperation, WriteRequest}; use super::*; @@ -153,7 +153,7 @@ mod tests { fn new_test_write_batch() -> WriteBatch { write_batch_util::new_write_batch( &[ - ("ts", 
LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("value", LogicalTypeId::Int64, true), ], Some(0), @@ -162,7 +162,7 @@ mod tests { fn new_region_schema() -> RegionSchemaRef { let desc = RegionDescBuilder::new("test") - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .push_value_column(("value", LogicalTypeId::Int64, true)) .enable_version_column(false) .build(); @@ -173,9 +173,9 @@ mod tests { fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option)]) { let mut put_data = PutData::with_num_columns(2); - let ts = TimestampVector::from_values(data.iter().map(|v| v.0)); + let ts = TimestampMillisecondVector::from_values(data.iter().map(|v| v.0)); put_data.add_key_column("ts", Arc::new(ts)).unwrap(); - let value = Int64Vector::from_iter(data.iter().map(|v| v.1)); + let value = Int64Vector::from(data.iter().map(|v| v.1).collect::>()); put_data.add_value_column("value", Arc::new(value)).unwrap(); batch.put(put_data).unwrap(); @@ -195,7 +195,10 @@ mod tests { for i in 0..row_num { let ts = batch.column(0).get(i); let v = batch.column(1).get(i); - assert_eq!(Value::Timestamp(Timestamp::from_millis(data[index].0)), ts); + assert_eq!( + Value::Timestamp(Timestamp::new_millisecond(data[index].0)), + ts + ); assert_eq!(Value::from(data[index].1), v); assert_eq!(Value::from(sequence), batch.column(2).get(i)); diff --git a/src/storage/src/memtable/tests.rs b/src/storage/src/memtable/tests.rs index d51cc844ca..2ede68cc0f 100644 --- a/src/storage/src/memtable/tests.rs +++ b/src/storage/src/memtable/tests.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_time::timestamp::Timestamp; -use datatypes::arrow; -use datatypes::arrow::array::{Int64Array, PrimitiveArray, UInt64Array, UInt8Array}; use datatypes::prelude::*; +use datatypes::timestamp::TimestampMillisecond; use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{TimestampVectorBuilder, UInt64VectorBuilder}; +use datatypes::vectors::{ + TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, +}; use super::*; use crate::metadata::RegionMetadata; @@ -43,13 +44,13 @@ fn kvs_for_test_with_index( sequence: SequenceNumber, op_type: OpType, start_index_in_batch: usize, - keys: &[(Timestamp, u64)], + keys: &[(TimestampMillisecond, u64)], values: &[(Option, Option)], ) -> KeyValues { assert_eq!(keys.len(), values.len()); let mut key_builders = ( - TimestampVectorBuilder::with_capacity(keys.len()), + TimestampMillisecondVectorBuilder::with_capacity(keys.len()), UInt64VectorBuilder::with_capacity(keys.len()), ); for key in keys { @@ -91,7 +92,7 @@ fn kvs_for_test_with_index( fn kvs_for_test( sequence: SequenceNumber, op_type: OpType, - keys: &[(Timestamp, u64)], + keys: &[(TimestampMillisecond, u64)], values: &[(Option, Option)], ) -> KeyValues { kvs_for_test_with_index(sequence, op_type, 0, keys, values) @@ -104,7 +105,8 @@ pub fn write_kvs( keys: &[(i64, u64)], values: &[(Option, Option)], ) { - let keys: Vec<(Timestamp, u64)> = keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); + let keys: Vec<(TimestampMillisecond, u64)> = + keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); let kvs = kvs_for_test(sequence, op_type, &keys, values); @@ -126,7 +128,8 @@ fn check_iter_content( op_types: &[OpType], values: &[(Option, Option)], ) { - let keys: Vec<(Timestamp, u64)> = keys.iter().map(|(l, r)| 
((*l).into(), *r)).collect(); + let keys: Vec<(TimestampMillisecond, u64)> = + keys.iter().map(|(l, r)| ((*l).into(), *r)).collect(); let mut index = 0; for batch in iter { @@ -239,7 +242,7 @@ fn write_iter_memtable_case(ctx: &TestContext) { ); // 9 key value pairs (6 + 3). - assert_eq!(288, ctx.memtable.bytes_allocated()); + assert_eq!(704, ctx.memtable.bytes_allocated()); let batch_sizes = [1, 4, 8, consts::READ_BATCH_SIZE]; for batch_size in batch_sizes { @@ -576,22 +579,16 @@ fn test_memtable_projection() { assert!(iter.next().is_none()); assert_eq!(5, batch.num_columns()); - let k0 = Int64Array::from_slice(&[1000, 1001, 1002]); - let k0 = PrimitiveArray::new( - arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), - k0.values().clone(), - k0.validity().cloned(), - ); + let k0 = Arc::new(TimestampMillisecondVector::from_slice(&[1000, 1001, 1002])) as VectorRef; + let k1 = Arc::new(UInt64Vector::from_slice(&[0, 1, 2])) as VectorRef; + let v0 = Arc::new(UInt64Vector::from_slice(&[10, 11, 12])) as VectorRef; + let sequences = Arc::new(UInt64Vector::from_slice(&[9, 9, 9])) as VectorRef; + let op_types = Arc::new(UInt8Vector::from_slice(&[0, 0, 0])) as VectorRef; - let k1 = UInt64Array::from_slice(&[0, 1, 2]); - let v0 = UInt64Array::from_slice(&[10, 11, 12]); - let sequences = UInt64Array::from_slice(&[9, 9, 9]); - let op_types = UInt8Array::from_slice(&[0, 0, 0]); - - assert_eq!(k0, &*batch.column(0).to_arrow_array()); - assert_eq!(k1, &*batch.column(1).to_arrow_array()); - assert_eq!(v0, &*batch.column(2).to_arrow_array()); - assert_eq!(sequences, &*batch.column(3).to_arrow_array()); - assert_eq!(op_types, &*batch.column(4).to_arrow_array()); + assert_eq!(k0, *batch.column(0)); + assert_eq!(k1, *batch.column(1)); + assert_eq!(v0, *batch.column(2)); + assert_eq!(sequences, *batch.column(3)); + assert_eq!(op_types, *batch.column(4)); }); } diff --git a/src/storage/src/metadata.rs b/src/storage/src/metadata.rs index 3808f4f14f..02fc437509 100644 --- a/src/storage/src/metadata.rs +++ b/src/storage/src/metadata.rs @@ -186,7 +186,7 @@ pub type VersionNumber = u32; // TODO(yingwen): We may need to hold a list of history schema. /// In memory metadata of region. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct RegionMetadata { // The following fields are immutable. id: RegionId, @@ -376,7 +376,7 @@ const METADATA_CF_ID_KEY: &str = "greptime:storage:cf_id"; const METADATA_COLUMN_ID_KEY: &str = "greptime:storage:column_id"; const METADATA_COMMENT_KEY: &str = "greptime:storage:comment"; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ColumnMetadata { pub cf_id: ColumnFamilyId, pub desc: ColumnDescriptor, @@ -458,7 +458,7 @@ where default_value.context(MetaNotFoundSnafu { key }) } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ColumnsMetadata { /// All columns. 
/// @@ -926,7 +926,7 @@ mod tests { fn test_descriptor_to_region_metadata() { let region_name = "region-0"; let desc = RegionDescBuilder::new(region_name) - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .enable_version_column(false) .push_key_column(("k1", LogicalTypeId::Int32, false)) .push_value_column(("v1", LogicalTypeId::Float32, true)) @@ -935,7 +935,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int32, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Float32, true), ], Some(1), @@ -1036,12 +1036,15 @@ mod tests { } fn new_metadata(enable_version_column: bool) -> RegionMetadata { - let timestamp = - ColumnDescriptorBuilder::new(2, "ts", ConcreteDataType::timestamp_millis_datatype()) - .is_nullable(false) - .is_time_index(true) - .build() - .unwrap(); + let timestamp = ColumnDescriptorBuilder::new( + 2, + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + ) + .is_nullable(false) + .is_time_index(true) + .build() + .unwrap(); let row_key = RowKeyDescriptorBuilder::new(timestamp) .push_column( ColumnDescriptorBuilder::new(3, "k1", ConcreteDataType::int64_datatype()) @@ -1078,7 +1081,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), ("v1", LogicalTypeId::Int64, true), ], Some(1), @@ -1125,7 +1128,7 @@ mod tests { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int64, false), - ("ts", LogicalTypeId::Timestamp, false), + ("ts", LogicalTypeId::TimestampMillisecond, false), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), ("v1", LogicalTypeId::Int64, true), ], @@ -1266,7 +1269,7 @@ mod tests { fn test_validate_alter_request() { let builder = RegionDescBuilder::new("region-alter") .enable_version_column(false) - .timestamp(("ts", LogicalTypeId::Timestamp, false)) + .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) .push_key_column(("k0", LogicalTypeId::Int32, false)) .push_value_column(("v0", LogicalTypeId::Float32, true)) .push_value_column(("v1", LogicalTypeId::Float32, true)); diff --git a/src/storage/src/proto/write_batch.rs b/src/storage/src/proto/write_batch.rs index 7d5ef21aed..d710df9dc2 100644 --- a/src/storage/src/proto/write_batch.rs +++ b/src/storage/src/proto/write_batch.rs @@ -22,13 +22,18 @@ use common_error::prelude::*; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::{ScalarVector, ScalarVectorBuilder}; use datatypes::schema; +use datatypes::types::TimestampType; use datatypes::vectors::{ - BinaryVector, BinaryVectorBuilder, BooleanVector, BooleanVectorBuilder, Float32Vector, - Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, Int16VectorBuilder, - Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, Int8Vector, - Int8VectorBuilder, StringVector, StringVectorBuilder, TimestampVector, TimestampVectorBuilder, - UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, Vector, VectorRef, + BinaryVector, BinaryVectorBuilder, BooleanVector, BooleanVectorBuilder, DateTimeVector, + DateTimeVectorBuilder, DateVector, DateVectorBuilder, Float32Vector, Float32VectorBuilder, + Float64Vector, Float64VectorBuilder, Int16Vector, Int16VectorBuilder, 
Int32Vector, + Int32VectorBuilder, Int64Vector, Int64VectorBuilder, Int8Vector, Int8VectorBuilder, + StringVector, StringVectorBuilder, TimestampMicrosecondVector, + TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, UInt16Vector, UInt16VectorBuilder, + UInt32Vector, UInt32VectorBuilder, UInt64Vector, UInt64VectorBuilder, UInt8Vector, + UInt8VectorBuilder, Vector, VectorRef, }; use paste::paste; use snafu::OptionExt; @@ -148,7 +153,12 @@ impl From<&ConcreteDataType> for DataType { ConcreteDataType::String(_) => DataType::String, ConcreteDataType::Null(_) => DataType::Null, ConcreteDataType::Binary(_) => DataType::Binary, - ConcreteDataType::Timestamp(_) => DataType::Timestamp, + ConcreteDataType::Timestamp(unit) => match unit { + TimestampType::Second(_) => DataType::TimestampSecond, + TimestampType::Millisecond(_) => DataType::TimestampMillisecond, + TimestampType::Microsecond(_) => DataType::TimestampMicrosecond, + TimestampType::Nanosecond(_) => DataType::TimestampNanosecond, + }, ConcreteDataType::Date(_) | ConcreteDataType::DateTime(_) | ConcreteDataType::List(_) => { @@ -176,7 +186,12 @@ impl From for ConcreteDataType { DataType::String => ConcreteDataType::string_datatype(), DataType::Binary => ConcreteDataType::binary_datatype(), DataType::Null => ConcreteDataType::null_datatype(), - DataType::Timestamp => ConcreteDataType::timestamp_millis_datatype(), + DataType::Date => ConcreteDataType::date_datatype(), + DataType::Datetime => ConcreteDataType::datetime_datatype(), + DataType::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + DataType::TimestampMillisecond => ConcreteDataType::timestamp_millisecond_datatype(), + DataType::TimestampMicrosecond => ConcreteDataType::timestamp_microsecond_datatype(), + DataType::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), } } } @@ -239,7 +254,12 @@ gen_columns!(f64, Float64Vector, v, v); gen_columns!(bool, BooleanVector, v, v); gen_columns!(binary, BinaryVector, v, v.to_vec()); gen_columns!(string, StringVector, v, v.to_string()); -gen_columns!(timestamp, TimestampVector, v, v.value()); +gen_columns!(date, DateVector, v, v.val()); +gen_columns!(datetime, DateTimeVector, v, v.val()); +gen_columns!(ts_second, TimestampSecondVector, v, v.into()); +gen_columns!(ts_millisecond, TimestampMillisecondVector, v, v.into()); +gen_columns!(ts_microsecond, TimestampMicrosecondVector, v, v.into()); +gen_columns!(ts_nanosecond, TimestampNanosecondVector, v, v.into()); #[macro_export] macro_rules! 
gen_put_data { @@ -287,7 +307,27 @@ gen_put_data!(f64, Float64VectorBuilder, v, *v as f64); gen_put_data!(bool, BooleanVectorBuilder, v, *v); gen_put_data!(binary, BinaryVectorBuilder, v, v.as_slice()); gen_put_data!(string, StringVectorBuilder, v, v.as_str()); -gen_put_data!(timestamp, TimestampVectorBuilder, v, (*v).into()); +gen_put_data!(date, DateVectorBuilder, v, (*v).into()); +gen_put_data!(datetime, DateTimeVectorBuilder, v, (*v).into()); +gen_put_data!(ts_second, TimestampSecondVectorBuilder, v, (*v).into()); +gen_put_data!( + ts_millisecond, + TimestampMillisecondVectorBuilder, + v, + (*v).into() +); +gen_put_data!( + ts_microsecond, + TimestampMicrosecondVectorBuilder, + v, + (*v).into() +); +gen_put_data!( + ts_nanosecond, + TimestampNanosecondVectorBuilder, + v, + (*v).into() +); pub fn gen_columns(vector: &VectorRef) -> Result { let data_type = vector.data_type(); @@ -305,11 +345,15 @@ pub fn gen_columns(vector: &VectorRef) -> Result { ConcreteDataType::Float64(_) => gen_columns_f64(vector), ConcreteDataType::Binary(_) => gen_columns_binary(vector), ConcreteDataType::String(_) => gen_columns_string(vector), - ConcreteDataType::Timestamp(_) => gen_columns_timestamp(vector), - ConcreteDataType::Null(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::List(_) => { + ConcreteDataType::Date(_) => gen_columns_date(vector), + ConcreteDataType::DateTime(_) => gen_columns_datetime(vector), + ConcreteDataType::Timestamp(t) => match t { + TimestampType::Second(_) => gen_columns_ts_second(vector), + TimestampType::Millisecond(_) => gen_columns_ts_millisecond(vector), + TimestampType::Microsecond(_) => gen_columns_ts_microsecond(vector), + TimestampType::Nanosecond(_) => gen_columns_ts_nanosecond(vector), + }, + ConcreteDataType::Null(_) | ConcreteDataType::List(_) => { // TODO(jiachun): Maybe support some composite types in the future, such as list, struct, etc. unimplemented!("data type {:?} is not supported", data_type) } @@ -331,11 +375,15 @@ pub fn gen_put_data_vector(data_type: ConcreteDataType, column: Column) -> Resul ConcreteDataType::Float64(_) => gen_put_data_f64(column), ConcreteDataType::Binary(_) => gen_put_data_binary(column), ConcreteDataType::String(_) => gen_put_data_string(column), - ConcreteDataType::Timestamp(_) => gen_put_data_timestamp(column), - ConcreteDataType::Null(_) - | ConcreteDataType::Date(_) - | ConcreteDataType::DateTime(_) - | ConcreteDataType::List(_) => { + ConcreteDataType::Date(_) => gen_put_data_date(column), + ConcreteDataType::DateTime(_) => gen_put_data_datetime(column), + ConcreteDataType::Timestamp(t) => match t { + TimestampType::Second(_) => gen_put_data_ts_second(column), + TimestampType::Millisecond(_) => gen_put_data_ts_millisecond(column), + TimestampType::Microsecond(_) => gen_put_data_ts_microsecond(column), + TimestampType::Nanosecond(_) => gen_put_data_ts_nanosecond(column), + }, + ConcreteDataType::Null(_) | ConcreteDataType::List(_) => { // TODO(jiachun): Maybe support some composite types in the future, such as list, struct, etc. 
unimplemented!("data type {:?} is not supported", data_type) } diff --git a/src/storage/src/read/merge.rs b/src/storage/src/read/merge.rs index 75c5d112dd..b4f76b1f41 100644 --- a/src/storage/src/read/merge.rs +++ b/src/storage/src/read/merge.rs @@ -605,7 +605,7 @@ impl MergeReader { #[cfg(test)] mod tests { use datatypes::prelude::ScalarVector; - use datatypes::vectors::{Int64Vector, TimestampVector}; + use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use super::*; use crate::test_util::read_util; @@ -692,7 +692,7 @@ mod tests { let key = batch .column(0) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let value = batch .column(1) @@ -703,7 +703,7 @@ mod tests { let batch: Vec<_> = key .iter_data() .zip(value.iter_data()) - .map(|(k, v)| (k.unwrap().value(), v)) + .map(|(k, v)| (k.unwrap().into(), v)) .collect(); result.push(batch); } diff --git a/src/storage/src/region/tests.rs b/src/storage/src/region/tests.rs index 57357f2da0..798eca19c1 100644 --- a/src/storage/src/region/tests.rs +++ b/src/storage/src/region/tests.rs @@ -20,10 +20,10 @@ mod flush; mod projection; use common_telemetry::logging; -use common_time::timestamp::Timestamp; -use datatypes::prelude::ScalarVector; +use datatypes::prelude::{ScalarVector, WrapperType}; +use datatypes::timestamp::TimestampMillisecond; use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{Int64Vector, TimestampVector}; +use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use log_store::fs::log::LocalFileLogStore; use log_store::fs::noop::NoopLogStore; use object_store::backend::fs; @@ -70,7 +70,7 @@ impl TesterBase { /// /// Format of data: (timestamp, v0), timestamp is key, v0 is value. pub async fn put(&self, data: &[(i64, Option)]) -> WriteResponse { - let data: Vec<(Timestamp, Option)> = + let data: Vec<(TimestampMillisecond, Option)> = data.iter().map(|(l, r)| ((*l).into(), *r)).collect(); // Build a batch without version. let mut batch = new_write_batch_for_test(false); @@ -82,7 +82,7 @@ impl TesterBase { /// Put without version specified directly to inner writer. 
pub async fn put_inner(&self, data: &[(i64, Option)]) -> WriteResponse { - let data: Vec<(Timestamp, Option)> = + let data: Vec<(TimestampMillisecond, Option)> = data.iter().map(|(l, r)| ((*l).into(), *r)).collect(); let mut batch = new_write_batch_for_test(false); let put_data = new_put_data(&data); @@ -131,7 +131,11 @@ fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch { if enable_version_column { write_batch_util::new_write_batch( &[ - (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false), + ( + test_util::TIMESTAMP_NAME, + LogicalTypeId::TimestampMillisecond, + false, + ), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), ("v0", LogicalTypeId::Int64, true), ], @@ -140,7 +144,11 @@ fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch { } else { write_batch_util::new_write_batch( &[ - (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false), + ( + test_util::TIMESTAMP_NAME, + LogicalTypeId::TimestampMillisecond, + false, + ), ("v0", LogicalTypeId::Int64, true), ], Some(0), @@ -148,11 +156,12 @@ fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch { } } -fn new_put_data(data: &[(Timestamp, Option)]) -> PutData { +fn new_put_data(data: &[(TimestampMillisecond, Option)]) -> PutData { let mut put_data = PutData::with_num_columns(2); - let timestamps = TimestampVector::from_vec(data.iter().map(|v| v.0).collect()); - let values = Int64Vector::from_iter(data.iter().map(|kv| kv.1)); + let timestamps = + TimestampMillisecondVector::from_vec(data.iter().map(|v| v.0.into()).collect()); + let values = Int64Vector::from_owned_iterator(data.iter().map(|kv| kv.1)); put_data .add_key_column(test_util::TIMESTAMP_NAME, Arc::new(timestamps)) @@ -167,14 +176,14 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option)>) { let timestamps = chunk.columns[0] .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let values = chunk.columns[1] .as_any() .downcast_ref::() .unwrap(); for (ts, value) in timestamps.iter_data().zip(values.iter_data()) { - dst.push((ts.unwrap().value(), value)); + dst.push((ts.unwrap().into_native(), value)); } } @@ -207,7 +216,11 @@ async fn test_new_region() { let expect_schema = schema_util::new_schema_ref( &[ ("k1", LogicalTypeId::Int32, false), - (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false), + ( + test_util::TIMESTAMP_NAME, + LogicalTypeId::TimestampMillisecond, + false, + ), (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false), ("v0", LogicalTypeId::Float32, true), ], diff --git a/src/storage/src/region/tests/alter.rs b/src/storage/src/region/tests/alter.rs index 3ab273f1bb..4372e96c95 100644 --- a/src/storage/src/region/tests/alter.rs +++ b/src/storage/src/region/tests/alter.rs @@ -15,9 +15,9 @@ use std::collections::BTreeMap; use std::sync::Arc; -use common_time::Timestamp; use datatypes::prelude::*; -use datatypes::vectors::{Int64Vector, TimestampVector}; +use datatypes::timestamp::TimestampMillisecond; +use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use log_store::fs::log::LocalFileLogStore; use store_api::storage::{ AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor, @@ -53,7 +53,7 @@ struct AlterTester { #[derive(Debug, Clone, PartialEq)] struct DataRow { key: Option, - ts: Timestamp, + ts: TimestampMillisecond, v0: Option, v1: Option, } @@ -71,11 +71,14 @@ impl DataRow { fn new_put_data(data: &[DataRow]) -> PutData { let mut put_data = PutData::with_num_columns(4); - - let keys = 
Int64Vector::from_iter(data.iter().map(|v| v.key)); - let timestamps = TimestampVector::from_vec(data.iter().map(|v| v.ts).collect()); - let values1 = Int64Vector::from_iter(data.iter().map(|kv| kv.v0)); - let values2 = Int64Vector::from_iter(data.iter().map(|kv| kv.v1)); + let keys = Int64Vector::from(data.iter().map(|v| v.key).collect::>()); + let timestamps = TimestampMillisecondVector::from( + data.iter() + .map(|v| Some(v.ts.into_native())) + .collect::>(), + ); + let values1 = Int64Vector::from(data.iter().map(|kv| kv.v0).collect::>()); + let values2 = Int64Vector::from(data.iter().map(|kv| kv.v1).collect::>()); put_data.add_key_column("k0", Arc::new(keys)).unwrap(); put_data @@ -193,7 +196,7 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec) { .unwrap(); let ts_vector = chunk.columns[1] .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let v0_vector = chunk.columns[2] .as_any() @@ -206,7 +209,7 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec) { for i in 0..k0_vector.len() { dst.push(DataRow::new( k0_vector.get_data(i), - ts_vector.get_data(i).unwrap().value(), + ts_vector.get_data(i).unwrap().into(), v0_vector.get_data(i), v1_vector.get_data(i), )); diff --git a/src/storage/src/region/tests/projection.rs b/src/storage/src/region/tests/projection.rs index d607f41133..98d0e5026a 100644 --- a/src/storage/src/region/tests/projection.rs +++ b/src/storage/src/region/tests/projection.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::ScalarVector; use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{Int64Vector, TimestampVector}; +use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; use log_store::fs::log::LocalFileLogStore; use store_api::logstore::LogStore; use store_api::storage::{ @@ -40,7 +40,11 @@ fn new_write_batch_for_test() -> WriteBatch { write_batch_util::new_write_batch( &[ ("k0", LogicalTypeId::Int64, false), - (test_util::TIMESTAMP_NAME, LogicalTypeId::Timestamp, false), + ( + test_util::TIMESTAMP_NAME, + LogicalTypeId::TimestampMillisecond, + false, + ), ("v0", LogicalTypeId::Int64, true), ("v1", LogicalTypeId::Int64, true), ], @@ -60,7 +64,7 @@ fn new_put_data(len: usize, key_start: i64, ts_start: i64, initial_value: i64) - let mut put_data = PutData::with_num_columns(4); let k0 = Int64Vector::from_values((0..len).map(|v| key_start + v as i64)); - let ts = TimestampVector::from_values((0..len).map(|v| ts_start + v as i64)); + let ts = TimestampMillisecondVector::from_values((0..len).map(|v| ts_start + v as i64)); let v0 = Int64Vector::from_values(std::iter::repeat(initial_value).take(len)); let v1 = Int64Vector::from_values((0..len).map(|v| initial_value + v as i64)); @@ -95,11 +99,11 @@ fn append_chunk_to(chunk: &Chunk, dst: &mut Vec>) { ConcreteDataType::Timestamp(_) => { let val = col .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() .get_data(i) .unwrap(); - row.push(val.value()); + row.push(val.into()); } _ => unreachable!(), } diff --git a/src/storage/src/schema.rs b/src/storage/src/schema.rs index 6bc344f3a7..dcec7ef1d1 100644 --- a/src/storage/src/schema.rs +++ b/src/storage/src/schema.rs @@ -25,7 +25,9 @@ pub use crate::schema::store::{StoreSchema, StoreSchemaRef}; mod tests { use std::sync::Arc; - use datatypes::vectors::{Int64Vector, UInt64Vector, UInt8Vector, VectorRef}; + use datatypes::vectors::{ + Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef, + }; use crate::read::Batch; @@ -37,7 +39,8 @@ mod tests { pub(crate) fn 
new_batch_with_num_values(num_value_columns: usize) -> Batch {
     let k0 = Int64Vector::from_slice(&[1, 2, 3]);
-    let timestamp = Int64Vector::from_slice(&[4, 5, 6]);
+    let timestamp = TimestampMillisecondVector::from_vec(vec![4, 5, 6]);
+
     let mut columns: Vec<VectorRef> = vec![Arc::new(k0), Arc::new(timestamp)];

     for i in 0..num_value_columns {
diff --git a/src/storage/src/schema/compat.rs b/src/storage/src/schema/compat.rs
index 5e1c22cd77..d8d5f9a08a 100644
--- a/src/storage/src/schema/compat.rs
+++ b/src/storage/src/schema/compat.rs
@@ -14,11 +14,7 @@
 //! Utilities for resolving schema compatibility problems.

-use std::sync::Arc;
-
-use datatypes::arrow::array::Array;
-use datatypes::arrow::chunk::Chunk;
-use datatypes::arrow::datatypes::Field;
+use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::schema::SchemaRef;
 use datatypes::vectors::{Helper, VectorRef};
 use snafu::{ensure, OptionExt, ResultExt};
@@ -230,36 +226,19 @@ impl ReadAdapter {
         self.source_columns_to_batch(source, num_rows)
     }

-    /// Returns list of fields need to read from the parquet file.
-    pub fn fields_to_read(&self) -> Vec<Field> {
-        if !self.need_compat() {
-            return self
-                .dest_schema
-                .schema_to_read()
-                .arrow_schema()
-                .fields
-                .clone();
-        }
-
-        self.source_schema
-            .arrow_schema()
-            .fields
+    /// Returns the list of field indices that need to be read from the parquet file.
+    pub fn fields_to_read(&self) -> Vec<usize> {
+        self.is_source_needed
             .iter()
-            .zip(self.is_source_needed.iter())
-            .filter_map(|(field, is_needed)| {
-                if *is_needed {
-                    Some(field.clone())
-                } else {
-                    None
-                }
-            })
-            .collect()
+            .enumerate()
+            .filter_map(|(idx, needed)| if *needed { Some(idx) } else { None })
+            .collect::<Vec<_>>()
     }

-    /// Convert chunk read from the parquet file into [Batch].
+    /// Convert [RecordBatch] read from the parquet file into [Batch].
     ///
-    /// The chunk should have the same schema as [`ReadAdapter::fields_to_read()`].
+    /// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`].
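Since `fields_to_read` now returns positions instead of arrow `Field`s, the parquet side (further below) can feed them straight into a `ProjectionMask`. A sketch of that consumption path, assuming parquet 26 with the `async` feature; the file path and indices are illustrative:

    use futures_util::TryStreamExt;
    use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
    use tokio::fs::File;

    async fn read_projected(path: &str) -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open(path).await?;
        let builder = ParquetRecordBatchStreamBuilder::new(file).await?;
        // Root column indices, e.g. the output of ReadAdapter::fields_to_read().
        let mask = ProjectionMask::roots(
            builder.metadata().file_metadata().schema_descr(),
            vec![0, 1, 3],
        );
        let mut stream = builder.with_projection(mask).build()?;
        while let Some(batch) = stream.try_next().await? {
            println!("{} rows x {} cols", batch.num_rows(), batch.num_columns());
        }
        Ok(())
    }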
+ pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result { let names = self .source_schema .schema() @@ -273,7 +252,8 @@ impl ReadAdapter { None } }); - let source = chunk + let source = record_batch + .columns() .iter() .zip(names) .map(|(column, name)| { @@ -281,11 +261,11 @@ impl ReadAdapter { }) .collect::>()?; - if !self.need_compat() || chunk.is_empty() { + if !self.need_compat() || record_batch.num_rows() == 0 { return Ok(Batch::new(source)); } - let num_rows = chunk.len(); + let num_rows = record_batch.num_rows(); self.source_columns_to_batch(source, num_rows) } @@ -323,8 +303,11 @@ impl ReadAdapter { #[cfg(test)] mod tests { + use std::sync::Arc; + use datatypes::data_type::ConcreteDataType; - use store_api::storage::{consts, ColumnDescriptorBuilder}; + use datatypes::schema::Schema; + use store_api::storage::ColumnDescriptorBuilder; use super::*; use crate::error::Error; @@ -332,12 +315,6 @@ mod tests { use crate::schema::{tests, ProjectedSchema, RegionSchema}; use crate::test_util::{descriptor_util, schema_util}; - fn check_fields(fields: &[Field], names: &[&str]) { - for (field, name) in fields.iter().zip(names) { - assert_eq!(&field.name, name); - } - } - fn call_batch_from_parts( adapter: &ReadAdapter, batch: &Batch, @@ -363,9 +340,26 @@ mod tests { } fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch { + let columns_schema = adapter + .source_schema + .columns() + .iter() + .zip(adapter.is_source_needed.iter()) + .filter_map(|(field, is_needed)| { + if *is_needed { + Some(field.to_column_schema().unwrap()) + } else { + None + } + }) + .collect::>(); + let arrow_schema = Schema::try_new(columns_schema) + .unwrap() + .arrow_schema() + .clone(); let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect(); - let chunk = Chunk::new(arrays); - adapter.arrow_chunk_to_batch(&chunk).unwrap() + let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap(); + adapter.arrow_record_batch_to_batch(&chunk).unwrap() } fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) { @@ -404,7 +398,6 @@ mod tests { // (k0, timestamp, v0, v1) with version 0. 
let region_schema = Arc::new(schema_util::new_region_schema(0, 2)); let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone())); - let source_schema = region_schema.store_schema().clone(); let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); @@ -414,17 +407,7 @@ mod tests { let batch = tests::new_batch_with_num_values(2); check_batch_from_parts_without_padding(&adapter, &batch, 2); - check_fields( - &adapter.fields_to_read(), - &[ - "k0", - "timestamp", - "v0", - "v1", - consts::SEQUENCE_COLUMN_NAME, - consts::OP_TYPE_COLUMN_NAME, - ], - ); + assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],); check_arrow_chunk_to_batch_without_padding(&adapter, &batch); } @@ -447,16 +430,7 @@ mod tests { let batch = tests::new_batch_with_num_values(1); check_batch_from_parts_without_padding(&adapter, &batch, 1); - check_fields( - &adapter.fields_to_read(), - &[ - "k0", - "timestamp", - "v0", - consts::SEQUENCE_COLUMN_NAME, - consts::OP_TYPE_COLUMN_NAME, - ], - ); + assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]); check_arrow_chunk_to_batch_without_padding(&adapter, &batch); } @@ -481,16 +455,7 @@ mod tests { let batch = tests::new_batch_with_num_values(1); check_batch_from_parts_without_padding(&adapter, &batch, 1); - check_fields( - &adapter.fields_to_read(), - &[ - "k0", - "timestamp", - "v0", - consts::SEQUENCE_COLUMN_NAME, - consts::OP_TYPE_COLUMN_NAME, - ], - ); + assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],); check_arrow_chunk_to_batch_without_padding(&adapter, &batch); } @@ -519,16 +484,7 @@ mod tests { // v2 is filled by null. check_batch_with_null_padding(&batch, &new_batch, &[3]); - check_fields( - &adapter.fields_to_read(), - &[ - "k0", - "timestamp", - "v0", - consts::SEQUENCE_COLUMN_NAME, - consts::OP_TYPE_COLUMN_NAME, - ], - ); + assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],); let new_batch = call_arrow_chunk_to_batch(&adapter, &batch); check_batch_with_null_padding(&batch, &new_batch, &[3]); @@ -567,16 +523,7 @@ mod tests { // v0 is filled by null. 
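For context on the padding asserted here: when the source file lacks a column that the destination schema expects, the adapter fills it with nulls. At the arrow level that amounts to roughly the following (illustrative only; the real code pads with typed vectors from the datatypes crate):

    use datatypes::arrow::array::new_null_array;
    use datatypes::arrow::datatypes::DataType;

    fn null_pad_example() {
        // A three-row, all-null stand-in for a column missing from the source.
        let pad = new_null_array(&DataType::UInt64, 3);
        assert_eq!(3, pad.null_count());
    }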
check_batch_with_null_padding(&batch, &new_batch, &[2]); - check_fields( - &adapter.fields_to_read(), - &[ - "k0", - "timestamp", - "v1", - consts::SEQUENCE_COLUMN_NAME, - consts::OP_TYPE_COLUMN_NAME, - ], - ); + assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],); let new_batch = call_arrow_chunk_to_batch(&adapter, &batch); check_batch_with_null_padding(&batch, &new_batch, &[2]); diff --git a/src/storage/src/schema/projected.rs b/src/storage/src/schema/projected.rs index 6e746c9ff9..f50d431433 100644 --- a/src/storage/src/schema/projected.rs +++ b/src/storage/src/schema/projected.rs @@ -186,7 +186,6 @@ impl ProjectedSchema { .collect() } }; - Chunk::new(columns) } @@ -337,7 +336,7 @@ impl BatchOp for ProjectedSchema { mod tests { use datatypes::prelude::ScalarVector; use datatypes::type_id::LogicalTypeId; - use datatypes::vectors::{TimestampVector, VectorRef}; + use datatypes::vectors::{TimestampMillisecondVector, VectorRef}; use store_api::storage::OpType; use super::*; @@ -398,7 +397,7 @@ mod tests { let expect_user = schema_util::new_schema_with_version( &[ ("v1", LogicalTypeId::Int64, true), - ("timestamp", LogicalTypeId::Timestamp, false), + ("timestamp", LogicalTypeId::TimestampMillisecond, false), ], Some(1), 123, @@ -524,7 +523,7 @@ mod tests { let filter = BooleanVector::from_slice(&[true, false, true]); let res = schema.filter(&batch, &filter).unwrap(); - let expect: VectorRef = Arc::new(TimestampVector::from_values([1000, 3000])); + let expect: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 3000])); assert_eq!(expect, *res.column(0)); } } diff --git a/src/storage/src/schema/region.rs b/src/storage/src/schema/region.rs index bfc046c868..b6c0ef2a4e 100644 --- a/src/storage/src/schema/region.rs +++ b/src/storage/src/schema/region.rs @@ -32,7 +32,7 @@ use crate::schema::{StoreSchema, StoreSchemaRef}; /// /// The user schema is the schema that only contains columns that user could visit, /// as well as what the schema user created. -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Eq)] pub struct RegionSchema { /// Schema that only contains columns that user defined, excluding internal columns /// that are reserved and used by the storage engine. @@ -162,7 +162,7 @@ mod tests { let expect_schema = schema_util::new_schema_with_version( &[ ("k0", LogicalTypeId::Int64, false), - ("timestamp", LogicalTypeId::Timestamp, false), + ("timestamp", LogicalTypeId::TimestampMillisecond, false), ("v0", LogicalTypeId::Int64, true), ], Some(1), diff --git a/src/storage/src/schema/store.rs b/src/storage/src/schema/store.rs index 681059b256..691320e8bd 100644 --- a/src/storage/src/schema/store.rs +++ b/src/storage/src/schema/store.rs @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
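The store.rs hunk that follows swaps arrow2's `Metadata` for arrow-rs's plain `HashMap<String, String>` schema metadata, which is where StoreSchema keeps its row-key and user-column markers. A sketch of that round-trip; the fields are illustrative and only the `greptime:storage:user_column_end` key appears verbatim in this patch, so treat the other key name as an assumption:

    use std::collections::HashMap;
    use std::sync::Arc;

    use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};

    fn schema_with_markers() -> Arc<Schema> {
        let mut metadata = HashMap::new();
        metadata.insert("greptime:storage:row_key_end".to_string(), "2".to_string());
        metadata.insert("greptime:storage:user_column_end".to_string(), "3".to_string());
        let fields = vec![
            Field::new("k0", DataType::Int64, false),
            Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), false),
        ];
        Arc::new(Schema::new_with_metadata(fields, metadata))
    }

    fn parse_marker(schema: &Schema, key: &str) -> Option<usize> {
        // What parse_index_from_metadata boils down to with a HashMap.
        schema.metadata().get(key)?.parse().ok()
    }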
+use std::collections::HashMap; use std::sync::Arc; use common_error::prelude::*; -use datatypes::arrow::array::Array; -use datatypes::arrow::chunk::Chunk as ArrowChunk; use datatypes::arrow::datatypes::Schema as ArrowSchema; -use datatypes::schema::{Metadata, Schema, SchemaBuilder, SchemaRef}; +use datatypes::arrow::record_batch::RecordBatch; +use datatypes::schema::{Schema, SchemaBuilder, SchemaRef}; use store_api::storage::consts; +use crate::error::NewRecordBatchSnafu; use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result}; use crate::read::Batch; @@ -31,7 +32,7 @@ const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end"; /// /// Used internally, contains all row key columns, internal columns and a sub set of /// value columns in a region. The columns are organized in `key, value, internal` order. -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Eq)] pub struct StoreSchema { columns: Vec, schema: SchemaRef, @@ -57,10 +58,16 @@ impl StoreSchema { self.schema.arrow_schema() } - pub fn batch_to_arrow_chunk(&self, batch: &Batch) -> ArrowChunk> { - assert_eq!(self.schema.num_columns(), batch.num_columns()); - - ArrowChunk::new(batch.columns().iter().map(|v| v.to_arrow_array()).collect()) + pub fn batch_to_arrow_record_batch( + &self, + batch: &Batch, + ) -> std::result::Result { + assert_eq!(self.schema.num_columns(), batch.num_columns(),); + RecordBatch::try_new( + self.schema.arrow_schema().clone(), + batch.columns().iter().map(|v| v.to_arrow_array()).collect(), + ) + .context(NewRecordBatchSnafu) } pub(crate) fn contains_column(&self, name: &str) -> bool { @@ -181,10 +188,10 @@ impl StoreSchema { } } -impl TryFrom for StoreSchema { +impl TryFrom> for StoreSchema { type Error = Error; - fn try_from(arrow_schema: ArrowSchema) -> Result { + fn try_from(arrow_schema: Arc) -> std::result::Result { let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?; // Recover other metadata from schema. let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?; @@ -216,7 +223,15 @@ impl TryFrom for StoreSchema { } } -fn parse_index_from_metadata(metadata: &Metadata, key: &str) -> Result { +impl TryFrom for StoreSchema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchema) -> std::result::Result { + StoreSchema::try_from(Arc::new(arrow_schema)) + } +} + +fn parse_index_from_metadata(metadata: &HashMap, key: &str) -> Result { let value = metadata .get(key) .context(metadata::MetaNotFoundSnafu { key })?; @@ -227,20 +242,17 @@ fn parse_index_from_metadata(metadata: &Metadata, key: &str) -> Result { #[cfg(test)] mod tests { - use datatypes::arrow::array::Array; - use datatypes::arrow::chunk::Chunk as ArrowChunk; - use super::*; use crate::read::Batch; use crate::schema::tests; use crate::test_util::schema_util; - fn check_chunk_batch(chunk: &ArrowChunk>, batch: &Batch) { - assert_eq!(5, chunk.columns().len()); - assert_eq!(3, chunk.len()); + fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) { + assert_eq!(5, record_batch.num_columns()); + assert_eq!(3, record_batch.num_rows()); for i in 0..5 { - assert_eq!(chunk[i], batch.column(i).to_arrow_array()); + assert_eq!(record_batch.column(i), &batch.column(i).to_arrow_array()); } } @@ -280,7 +292,7 @@ mod tests { // Test batch and chunk conversion. let batch = tests::new_batch(); // Convert batch to chunk. 
- let chunk = store_schema.batch_to_arrow_chunk(&batch); + let chunk = store_schema.batch_to_arrow_record_batch(&batch).unwrap(); check_chunk_batch(&chunk, &batch); } } diff --git a/src/storage/src/sst.rs b/src/storage/src/sst.rs index 273841809e..afe1c10fdf 100644 --- a/src/storage/src/sst.rs +++ b/src/storage/src/sst.rs @@ -259,7 +259,7 @@ impl AccessLayer for FsAccessLayer { opts.predicate.clone(), ); - let stream = reader.chunk_stream(opts.batch_size).await?; + let stream = reader.chunk_stream().await?; Ok(Box::new(stream)) } } diff --git a/src/storage/src/sst/parquet.rs b/src/storage/src/sst/parquet.rs index 1244582b69..5bde4ac4e4 100644 --- a/src/storage/src/sst/parquet.rs +++ b/src/storage/src/sst/parquet.rs @@ -18,28 +18,23 @@ use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; +use async_compat::CompatExt; use async_stream::try_stream; use async_trait::async_trait; -use common_telemetry::debug; -use datatypes::arrow::array::Array; -use datatypes::arrow::chunk::Chunk; -use datatypes::arrow::datatypes::{DataType, Schema}; -use datatypes::arrow::io::parquet::read::{ - infer_schema, read_columns_many_async, read_metadata_async, RowGroupDeserializer, -}; -use datatypes::arrow::io::parquet::write::{ - Compression, Encoding, FileSink, Version, WriteOptions, -}; -use futures::io::BufReader; -use futures::AsyncWriteExt; -use futures_util::sink::SinkExt; -use futures_util::{try_join, Stream, TryStreamExt}; -use object_store::{ObjectStore, SeekableReader}; -use sluice::pipe; +use datatypes::arrow::record_batch::RecordBatch; +use futures_util::{Stream, StreamExt, TryStreamExt}; +use object_store::ObjectStore; +use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::basic::{Compression, Encoding}; +use parquet::file::metadata::KeyValue; +use parquet::file::properties::WriterProperties; use snafu::ResultExt; use table::predicate::Predicate; +use tokio::io::BufReader; -use crate::error::{self, Result}; +use crate::error::{ + self, NewRecordBatchSnafu, ReadParquetSnafu, Result, WriteObjectSnafu, WriteParquetSnafu, +}; use crate::memtable::BoxedBatchIterator; use crate::read::{Batch, BatchReader}; use crate::schema::compat::ReadAdapter; @@ -51,6 +46,7 @@ pub struct ParquetWriter<'a> { file_path: &'a str, iter: BoxedBatchIterator, object_store: ObjectStore, + max_row_group_size: usize, } impl<'a> ParquetWriter<'a> { @@ -63,6 +59,7 @@ impl<'a> ParquetWriter<'a> { file_path, iter, object_store, + max_row_group_size: 4096, // TODO(hl): make this configurable } } @@ -76,122 +73,46 @@ impl<'a> ParquetWriter<'a> { async fn write_rows(self, extra_meta: Option>) -> Result<()> { let projected_schema = self.iter.schema(); let store_schema = projected_schema.schema_to_read(); - let schema = store_schema.arrow_schema(); + let schema = store_schema.arrow_schema().clone(); let object = self.object_store.object(self.file_path); - let (reader, mut writer) = pipe::pipe(); + let writer_props = WriterProperties::builder() + .set_compression(Compression::ZSTD) + .set_encoding(Encoding::PLAIN) + .set_max_row_group_size(self.max_row_group_size) + .set_key_value_metadata(extra_meta.map(|map| { + map.iter() + .map(|(k, v)| KeyValue::new(k.clone(), v.clone())) + .collect::>() + })) + .build(); - // now all physical types use plain encoding, maybe let caller to choose encoding for each type. 
-        let encodings = get_encoding_for_schema(schema, |_| Encoding::Plain);
-        try_join!(
-            async {
-                // FIXME(hl): writer size is not used in fs backend so just leave it to 0,
-                // but in s3/azblob backend the Content-Length field of HTTP request is set
-                // to this value.
-                object
-                    .write_from(0, reader)
-                    .await
-                    .context(error::FlushIoSnafu)
-            },
-            async {
-                let mut sink = FileSink::try_new(
-                    &mut writer,
-                    // The file sink needs the `Schema` instead of a reference.
-                    (**schema).clone(),
-                    encodings,
-                    WriteOptions {
-                        write_statistics: true,
-                        compression: Compression::Gzip,
-                        version: Version::V2,
-                    },
-                )
-                .context(error::WriteParquetSnafu)?;
-
-                for batch in self.iter {
-                    let batch = batch?;
-                    sink.send(store_schema.batch_to_arrow_chunk(&batch))
-                        .await
-                        .context(error::WriteParquetSnafu)?;
-                }
-
-                if let Some(meta) = extra_meta {
-                    for (k, v) in meta {
-                        sink.metadata.insert(k, Some(v));
-                    }
-                }
-                sink.close().await.context(error::WriteParquetSnafu)?;
-                drop(sink);
-
-                writer
-                    .close()
-                    .await
-                    .map_err(|err| {
-                        object_store::Error::new(
-                            object_store::ErrorKind::Unexpected,
-                            "writer close failed",
-                        )
-                        .set_source(err)
-                    })
-                    .context(error::WriteObjectSnafu {
-                        path: self.file_path,
-                    })
-            }
-        )
-        .map(|_| ())
-    }
-}
-
-fn get_encoding_for_schema<F: Fn(&DataType) -> Encoding + Clone>(
-    schema: &Schema,
-    map: F,
-) -> Vec<Encoding> {
-    schema
-        .fields
-        .iter()
-        .flat_map(|f| transverse(&f.data_type, map.clone()))
-        .collect()
-}
-
-// TODO(hl): backport from arrow2 v0.12 (https://github.com/jorgecarleitao/arrow2/blob/f57dbd5dbc61b940a71decd5f81d0fd4c93b158d/src/io/parquet/write/mod.rs#L454-L509)
-// remove it when upgrade to newer version
-pub fn transverse<T, F: Fn(&DataType) -> T + Clone>(data_type: &DataType, map: F) -> Vec<T> {
-    let mut encodings = vec![];
-    transverse_recursive(data_type, map, &mut encodings);
-    encodings
-}
-
-fn transverse_recursive<T, F: Fn(&DataType) -> T + Clone>(
-    data_type: &DataType,
-    map: F,
-    encodings: &mut Vec<T>,
-) {
-    use datatypes::arrow::datatypes::PhysicalType::*;
-    match data_type.to_physical_type() {
-        Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8
-        | Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)),
-        List | FixedSizeList | LargeList => {
-            let a = data_type.to_logical_type();
-            if let DataType::List(inner) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else if let DataType::LargeList(inner) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else if let DataType::FixedSizeList(inner, _) = a {
-                transverse_recursive(&inner.data_type, map, encodings)
-            } else {
-                unreachable!()
-            }
+        // TODO(hl): Since OpenDAL's writer is async and ArrowWriter requires a `std::io::Write`,
+        // here we use a Vec to buffer all parquet bytes in memory and write them to the object
+        // store in one call. Maybe we should find a better way to bridge ArrowWriter and
+        // OpenDAL's object.
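On the TODO just above: one possible direction, sketched purely as an illustration, is a cloneable buffer that implements `std::io::Write`, so the `ArrowWriter` can own one handle while the upload side drains another. It still buffers in memory, but it decouples the writer from a borrowed `&mut Vec<u8>`:

    use std::io::Write;
    use std::sync::{Arc, Mutex};

    /// A cheaply cloneable in-memory sink; every clone appends to the same bytes.
    #[derive(Clone, Default)]
    struct SharedBuffer {
        inner: Arc<Mutex<Vec<u8>>>,
    }

    impl Write for SharedBuffer {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            self.inner.lock().unwrap().write(buf)
        }

        fn flush(&mut self) -> std::io::Result<()> {
            Ok(())
        }
    }

With something like this, the flushing task could periodically swap the accumulated bytes out and ship them to the object store instead of waiting for the whole file.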
+ let mut buf = vec![]; + let mut arrow_writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(writer_props)) + .context(WriteParquetSnafu)?; + for batch in self.iter { + let batch = batch?; + let arrow_batch = RecordBatch::try_new( + schema.clone(), + batch + .columns() + .iter() + .map(|v| v.to_arrow_array()) + .collect::>(), + ) + .context(NewRecordBatchSnafu)?; + arrow_writer + .write(&arrow_batch) + .context(WriteParquetSnafu)?; } - Struct => { - if let DataType::Struct(fields) = data_type.to_logical_type() { - for field in fields { - transverse_recursive(&field.data_type, map.clone(), encodings) - } - } else { - unreachable!() - } - } - Union => todo!(), - Map => todo!(), + arrow_writer.close().context(WriteParquetSnafu)?; + object.write(buf).await.context(WriteObjectSnafu { + path: object.path(), + })?; + Ok(()) } } @@ -202,9 +123,6 @@ pub struct ParquetReader<'a> { predicate: Predicate, } -type ReaderFactoryFuture<'a, R> = - Pin> + Send + 'a>>; - impl<'a> ParquetReader<'a> { pub fn new( file_path: &str, @@ -220,61 +138,48 @@ impl<'a> ParquetReader<'a> { } } - pub async fn chunk_stream(&self, chunk_size: usize) -> Result { - let file_path = self.file_path.to_string(); + pub async fn chunk_stream(&self) -> Result { let operator = self.object_store.clone(); - let reader_factory = move || -> ReaderFactoryFuture { - let file_path = file_path.clone(); - let operator = operator.clone(); - Box::pin(async move { Ok(operator.object(&file_path).seekable_reader(..)) }) - }; - - let file_path = self.file_path.to_string(); - let reader = reader_factory() + let reader = operator.object(self.file_path).seekable_reader(..).compat(); + let buf_reader = BufReader::new(reader); + let builder = ParquetRecordBatchStreamBuilder::new(buf_reader) .await - .context(error::ReadParquetIoSnafu { file: &file_path })?; - // Use BufReader to alleviate consumption bring by random seek and small IO. - let mut buf_reader = BufReader::new(reader); - let metadata = read_metadata_async(&mut buf_reader) - .await - .context(error::ReadParquetSnafu { file: &file_path })?; + .context(ReadParquetSnafu { + file: self.file_path, + })?; + let arrow_schema = builder.schema().clone(); - let arrow_schema = - infer_schema(&metadata).context(error::ReadParquetSnafu { file: &file_path })?; - let store_schema = Arc::new( - StoreSchema::try_from(arrow_schema) - .context(error::ConvertStoreSchemaSnafu { file: &file_path })?, - ); + let store_schema = Arc::new(StoreSchema::try_from(arrow_schema).context( + error::ConvertStoreSchemaSnafu { + file: self.file_path, + }, + )?); let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?; - let pruned_row_groups = self - .predicate - .prune_row_groups(store_schema.schema().clone(), &metadata.row_groups); + let pruned_row_groups = self.predicate.prune_row_groups( + store_schema.schema().clone(), + builder.metadata().row_groups(), + ); - let projected_fields = adapter.fields_to_read(); + let projection = ProjectionMask::roots( + builder.metadata().file_metadata().schema_descr(), + adapter.fields_to_read(), + ); + + let mut masked_stream = builder + .with_projection(projection) + .build() + .context(ReadParquetSnafu { + file: self.file_path, + })? 
+ .zip(futures_util::stream::iter(pruned_row_groups.into_iter())); + + let file_name = self.file_path.to_string(); let chunk_stream = try_stream!({ - for (idx, valid) in pruned_row_groups.iter().enumerate() { - if !valid { - debug!("Pruned {} row groups", idx); - continue; - } - - let rg = &metadata.row_groups[idx]; - let column_chunks = read_columns_many_async( - &reader_factory, - rg, - projected_fields.clone(), - Some(chunk_size), - ) - .await - .context(error::ReadParquetSnafu { file: &file_path })?; - - let chunks = RowGroupDeserializer::new(column_chunks, rg.num_rows() as usize, None); - for maybe_chunk in chunks { - let columns_in_chunk = - maybe_chunk.context(error::ReadParquetSnafu { file: &file_path })?; - yield columns_in_chunk; + while let Some((record_batch, valid)) = masked_stream.next().await { + if valid { + yield record_batch.context(ReadParquetSnafu { file: &file_name })? } } }); @@ -283,7 +188,7 @@ impl<'a> ParquetReader<'a> { } } -pub type SendableChunkStream = Pin>>> + Send>>; +pub type SendableChunkStream = Pin> + Send>>; pub struct ChunkStream { adapter: ReadAdapter, @@ -302,7 +207,7 @@ impl BatchReader for ChunkStream { self.stream .try_next() .await? - .map(|chunk| self.adapter.arrow_chunk_to_batch(&chunk)) + .map(|rb| self.adapter.arrow_record_batch_to_batch(&rb)) .transpose() } } @@ -311,10 +216,9 @@ impl BatchReader for ChunkStream { mod tests { use std::sync::Arc; - use datatypes::arrow::array::{Array, UInt64Array, UInt8Array}; - use datatypes::arrow::io::parquet::read::FileReader; - use datatypes::prelude::{ScalarVector, Vector}; - use datatypes::vectors::TimestampVector; + use datatypes::arrow::array::{Array, ArrayRef, UInt64Array, UInt8Array}; + use datatypes::prelude::Vector; + use datatypes::vectors::TimestampMillisecondVector; use object_store::backend::fs::Builder; use store_api::storage::OpType; use tempdir::TempDir; @@ -323,6 +227,7 @@ mod tests { use crate::memtable::{ tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder, }; + use crate::schema::ProjectedSchema; #[tokio::test] async fn test_parquet_writer() { @@ -357,7 +262,7 @@ mod tests { let object_store = ObjectStore::new(backend); let sst_file_name = "test-flush.parquet"; let iter = memtable.iter(&IterContext::default()).unwrap(); - let writer = ParquetWriter::new(sst_file_name, iter, object_store); + let writer = ParquetWriter::new(sst_file_name, iter, object_store.clone()); writer .write_sst(&sst::WriteOptions::default()) @@ -365,17 +270,23 @@ mod tests { .unwrap(); // verify parquet file + let reader = BufReader::new( + object_store + .object(sst_file_name) + .seekable_reader(..) 
+ .compat(), + ); - let reader = std::fs::File::open(dir.path().join(sst_file_name)).unwrap(); - let mut file_reader = FileReader::try_new(reader, None, Some(128), None, None).unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); + let mut stream = builder.build().unwrap(); // chunk schema: timestamp, __version, v1, __sequence, __op_type - let chunk = file_reader.next().unwrap().unwrap(); - assert_eq!(6, chunk.arrays().len()); + let chunk = stream.next().await.unwrap().unwrap(); + assert_eq!(6, chunk.columns().len()); // timestamp assert_eq!( - TimestampVector::from_slice(&[ + &TimestampMillisecondVector::from_slice(&[ 1000.into(), 1000.into(), 1001.into(), @@ -384,39 +295,107 @@ mod tests { 2003.into() ]) .to_arrow_array(), - chunk.arrays()[0] + chunk.column(0) ); // version assert_eq!( - Arc::new(UInt64Array::from_slice(&[1, 2, 1, 1, 1, 5])) as Arc, - chunk.arrays()[1] + &(Arc::new(UInt64Array::from(vec![1, 2, 1, 1, 1, 5])) as ArrayRef), + chunk.column(1) ); // v0 assert_eq!( - Arc::new(UInt64Array::from_slice(&[1, 2, 3, 7, 8, 9])) as Arc, - chunk.arrays()[2] + &(Arc::new(UInt64Array::from(vec![1, 2, 3, 7, 8, 9])) as Arc), + chunk.column(2) ); // v1 assert_eq!( - Arc::new(UInt64Array::from_slice(&[ - 1234, 1234, 1234, 1234, 1234, 1234 - ])) as Arc, - chunk.arrays()[3] + &(Arc::new(UInt64Array::from(vec![1234, 1234, 1234, 1234, 1234, 1234])) + as Arc), + chunk.column(3) ); // sequence assert_eq!( - Arc::new(UInt64Array::from_slice(&[10, 10, 10, 10, 10, 10])) as Arc, - chunk.arrays()[4] + &(Arc::new(UInt64Array::from(vec![10, 10, 10, 10, 10, 10])) as Arc), + chunk.column(4) ); // op_type assert_eq!( - Arc::new(UInt8Array::from_slice(&[0, 0, 0, 0, 0, 0])) as Arc, - chunk.arrays()[5] + &(Arc::new(UInt8Array::from(vec![0, 0, 0, 0, 0, 0])) as Arc), + chunk.column(5) + ); + } + + #[tokio::test] + async fn test_parquet_reader() { + common_telemetry::init_default_ut_logging(); + let schema = memtable_tests::schema_for_test(); + let memtable = DefaultMemtableBuilder::default().build(schema.clone()); + + memtable_tests::write_kvs( + &*memtable, + 10, // sequence + OpType::Put, + &[ + (1000, 1), + (1000, 2), + (2002, 1), + (2003, 1), + (2003, 5), + (1001, 1), + ], // keys + &[ + (Some(1), Some(1234)), + (Some(2), Some(1234)), + (Some(7), Some(1234)), + (Some(8), Some(1234)), + (Some(9), Some(1234)), + (Some(3), Some(1234)), + ], // values + ); + + let dir = TempDir::new("write_parquet").unwrap(); + let path = dir.path().to_str().unwrap(); + let backend = Builder::default().root(path).build().unwrap(); + let object_store = ObjectStore::new(backend); + let sst_file_name = "test-read.parquet"; + let iter = memtable.iter(&IterContext::default()).unwrap(); + let writer = ParquetWriter::new(sst_file_name, iter, object_store.clone()); + + writer + .write_sst(&sst::WriteOptions::default()) + .await + .unwrap(); + + let operator = ObjectStore::new( + object_store::backend::fs::Builder::default() + .root(dir.path().to_str().unwrap()) + .build() + .unwrap(), + ); + + let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap()); + let reader = ParquetReader::new( + "test-read.parquet", + operator, + projected_schema, + Predicate::empty(), + ); + + let mut stream = reader.chunk_stream().await.unwrap(); + assert_eq!( + 6, + stream + .next_batch() + .await + .transpose() + .unwrap() + .unwrap() + .num_rows() ); } } diff --git a/src/storage/src/test_util/descriptor_util.rs b/src/storage/src/test_util/descriptor_util.rs index 50c8c2613e..10d682745b 100644 --- 
diff --git a/src/storage/src/test_util/descriptor_util.rs b/src/storage/src/test_util/descriptor_util.rs
index 50c8c2613e..10d682745b 100644
--- a/src/storage/src/test_util/descriptor_util.rs
+++ b/src/storage/src/test_util/descriptor_util.rs
@@ -37,7 +37,7 @@ impl RegionDescBuilder {
             ColumnDescriptorBuilder::new(
                 1,
                 test_util::TIMESTAMP_NAME,
-                ConcreteDataType::timestamp_millis_datatype(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
             )
             .is_nullable(false)
             .is_time_index(true)
diff --git a/src/storage/src/test_util/read_util.rs b/src/storage/src/test_util/read_util.rs
index 1b62611dc6..fe231de8ae 100644
--- a/src/storage/src/test_util/read_util.rs
+++ b/src/storage/src/test_util/read_util.rs
@@ -15,9 +15,9 @@ use std::sync::Arc;

 use async_trait::async_trait;
-use datatypes::prelude::ScalarVector;
+use datatypes::prelude::{ScalarVector, WrapperType};
 use datatypes::type_id::LogicalTypeId;
-use datatypes::vectors::{Int64Vector, TimestampVector, UInt64Vector, UInt8Vector};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector};
 use store_api::storage::OpType;

 use crate::error::Result;
@@ -45,8 +45,12 @@ pub fn new_projected_schema() -> ProjectedSchemaRef {

 /// Build a new batch, with 0 sequence and op_type.
 pub fn new_kv_batch(key_values: &[(i64, Option<i64>)]) -> Batch {
-    let key = Arc::new(TimestampVector::from_values(key_values.iter().map(|v| v.0)));
-    let value = Arc::new(Int64Vector::from_iter(key_values.iter().map(|v| v.1)));
+    let key = Arc::new(TimestampMillisecondVector::from_values(
+        key_values.iter().map(|v| v.0),
+    ));
+    let value = Arc::new(Int64Vector::from(
+        key_values.iter().map(|v| v.1).collect::<Vec<_>>(),
+    ));
     let sequences = Arc::new(UInt64Vector::from_vec(vec![0; key_values.len()]));
     let op_types = Arc::new(UInt8Vector::from_vec(vec![0; key_values.len()]));

@@ -55,7 +59,9 @@ pub fn new_kv_batch(key_values: &[(i64, Option<i64>)]) -> Batch {

 /// Build a new batch from (key, value, sequence, op_type)
 pub fn new_full_kv_batch(all_values: &[(i64, i64, u64, OpType)]) -> Batch {
-    let key = Arc::new(TimestampVector::from_values(all_values.iter().map(|v| v.0)));
+    let key = Arc::new(TimestampMillisecondVector::from_values(
+        all_values.iter().map(|v| v.0),
+    ));
     let value = Arc::new(Int64Vector::from_values(all_values.iter().map(|v| v.1)));
     let sequences = Arc::new(UInt64Vector::from_values(all_values.iter().map(|v| v.2)));
     let op_types = Arc::new(UInt8Vector::from_values(
@@ -70,7 +76,7 @@ fn check_kv_batch(batches: &[Batch], expect: &[&[(i64, Option<i64>)]]) {
         let key = batch
             .column(0)
             .as_any()
-            .downcast_ref::<TimestampVector>()
+            .downcast_ref::<TimestampMillisecondVector>()
             .unwrap();
         let value = batch
             .column(1)
             .as_any()
             .downcast_ref::<Int64Vector>()
             .unwrap();

         for (i, (k, v)) in key_values.iter().enumerate() {
-            assert_eq!(key.get_data(i).unwrap().value(), *k);
+            assert_eq!(key.get_data(i).unwrap().into_native(), *k);
             assert_eq!(value.get_data(i), *v,);
         }
     }
@@ -92,7 +98,7 @@ pub async fn collect_kv_batch(reader: &mut dyn BatchReader) -> Vec<(i64, Option<i64>)> {
         let key = batch
             .column(0)
             .as_any()
-            .downcast_ref::<TimestampVector>()
+            .downcast_ref::<TimestampMillisecondVector>()
             .unwrap();
         let value = batch
             .column(1)
             .as_any()
             .downcast_ref::<Int64Vector>()
             .unwrap();

         for (k, v) in key.iter_data().zip(value.iter_data()) {
-            result.push((k.unwrap().value(), v));
+            result.push((k.unwrap().into(), v));
         }
     }
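The `.value()` to `.into_native()` change reflects that every timestamp precision is now its own vector type wrapping an arrow array whose native storage is a plain i64. A tiny arrow-only illustration of that underlying representation (native_values is a made-up name; the repo's TimestampMillisecondVector wraps this array type):

use arrow::array::TimestampMillisecondArray;

fn native_values() {
    // One array type per precision; values are stored as raw i64s.
    let ts = TimestampMillisecondArray::from(vec![11_i64, 12, 13]);
    assert_eq!(11, ts.value(0)); // i64 milliseconds since the epoch
}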
diff --git a/src/storage/src/write_batch.rs b/src/storage/src/write_batch.rs
index 10d66bcd25..7f8768298f 100644
--- a/src/storage/src/write_batch.rs
+++ b/src/storage/src/write_batch.rs
@@ -26,7 +26,7 @@ use datatypes::arrow::error::ArrowError;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::prelude::{ScalarVector, Value};
 use datatypes::schema::{ColumnSchema, SchemaRef};
-use datatypes::vectors::{Int64Vector, TimestampVector, VectorRef};
+use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
 use prost::{DecodeError, EncodeError};
 use snafu::{ensure, OptionExt, ResultExt};
 use store_api::storage::{consts, PutOperation, WriteRequest};
@@ -116,9 +116,6 @@ pub enum Error {
         source: datatypes::error::Error,
     },

-    #[snafu(display("Failed to decode, in stream waiting state"))]
-    StreamWaiting { backtrace: Backtrace },
-
     #[snafu(display("Failed to decode, corrupted data {}", message))]
     DataCorrupted {
         message: String,
@@ -230,11 +227,13 @@ impl WriteRequest for WriteBatch {
         } else {
             match column.data_type() {
                 ConcreteDataType::Timestamp(_) => {
-                    let ts_vector =
-                        column.as_any().downcast_ref::<TimestampVector>().unwrap();
+                    let ts_vector = column
+                        .as_any()
+                        .downcast_ref::<TimestampMillisecondVector>()
+                        .unwrap();
                     for ts in ts_vector.iter_data().flatten() {
-                        let aligned = align_timestamp(ts.value(), durations_millis)
-                            .context(TimestampOverflowSnafu { ts: ts.value() })?;
+                        let aligned = align_timestamp(ts.into(), durations_millis)
+                            .context(TimestampOverflowSnafu { ts: i64::from(ts) })?;
                         aligned_timestamps.insert(aligned);
                     }
                 }
@@ -505,9 +504,9 @@ pub mod codec {
     use std::io::Cursor;
     use std::sync::Arc;

-    use datatypes::arrow::chunk::Chunk as ArrowChunk;
-    use datatypes::arrow::io::ipc::read::{self, StreamReader, StreamState};
-    use datatypes::arrow::io::ipc::write::{StreamWriter, WriteOptions};
+    use datatypes::arrow::ipc::reader::StreamReader;
+    use datatypes::arrow::ipc::writer::{IpcWriteOptions, StreamWriter};
+    use datatypes::arrow::record_batch::RecordBatch;
     use datatypes::schema::{Schema, SchemaRef};
     use datatypes::vectors::Helper;
     use prost::Message;
@@ -520,8 +519,8 @@ pub mod codec {
     use crate::write_batch::{
         DataCorruptedSnafu, DecodeArrowSnafu, DecodeProtobufSnafu, DecodeVectorSnafu,
         EncodeArrowSnafu, EncodeProtobufSnafu, Error as WriteBatchError, FromProtobufSnafu,
-        MissingColumnSnafu, Mutation, ParseSchemaSnafu, PutData, Result, StreamWaitingSnafu,
-        ToProtobufSnafu, WriteBatch,
+        MissingColumnSnafu, Mutation, ParseSchemaSnafu, PutData, Result, ToProtobufSnafu,
+        WriteBatch,
     };

     // TODO(jiachun): We can make a comparison with protobuf, including performance, storage cost,
@@ -543,12 +542,12 @@ pub mod codec {
             let item_schema = item.schema();
             let arrow_schema = item_schema.arrow_schema();

-            let opts = WriteOptions { compression: None };
-            let mut writer = StreamWriter::new(dst, opts);
-            writer.start(arrow_schema, None).context(EncodeArrowSnafu)?;
+            let opts = IpcWriteOptions::default();
+            let mut writer = StreamWriter::try_new_with_options(dst, arrow_schema, opts)
+                .context(EncodeArrowSnafu)?;

             for mutation in item.iter() {
-                let chunk = match mutation {
+                let rb = match mutation {
                     Mutation::Put(put) => {
                         let arrays = item_schema
                             .column_schemas()
                             .iter()
                             .map(|column_schema| {
                                 put.column_by_name(&column_schema.name)
                                     .map(|vector| vector.to_arrow_array())
                                     .context(MissingColumnSnafu {
                                         name: &column_schema.name,
                                     })
                             })
                             .collect::<Result<Vec<_>>>()?;

-                        ArrowChunk::try_new(arrays).context(EncodeArrowSnafu)?
+                        RecordBatch::try_new(arrow_schema.clone(), arrays)
+                            .context(EncodeArrowSnafu)?
                     }
                 };
-
-                writer.write(&chunk, None).context(EncodeArrowSnafu)?;
+                writer.write(&rb).context(EncodeArrowSnafu)?;
             }
-
         writer.finish().context(EncodeArrowSnafu)?;
-
         Ok(())
     }
 }
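The encoder now targets arrow-rs IPC: the writer takes the schema up front and batches are plain RecordBatches. A self-contained sketch of that write shape using only arrow-rs, without the crate's snafu error plumbing (encode and demo are illustrative names):

use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::Result;
use arrow::ipc::writer::{IpcWriteOptions, StreamWriter};
use arrow::record_batch::RecordBatch;

// Serialize one batch into an in-memory IPC stream.
fn encode(batch: &RecordBatch) -> Result<Vec<u8>> {
    let mut buf = Vec::new();
    let opts = IpcWriteOptions::default();
    let mut writer = StreamWriter::try_new_with_options(&mut buf, &batch.schema(), opts)?;
    writer.write(batch)?;
    writer.finish()?; // emit the end-of-stream marker
    drop(writer);
    Ok(buf)
}

fn demo() -> Result<Vec<u8>> {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1, 2, 3]))])?;
    encode(&batch)
}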
@@ -591,20 +588,14 @@ pub mod codec {
         type Error = WriteBatchError;

         fn decode(&self, src: &[u8]) -> Result<WriteBatch> {
-            let mut reader = Cursor::new(src);
-            let metadata = read::read_stream_metadata(&mut reader).context(DecodeArrowSnafu)?;
-            let mut reader = StreamReader::new(reader, metadata);
-            let arrow_schema = reader.metadata().schema.clone();
-
+            let reader = Cursor::new(src);
+            let mut reader = StreamReader::try_new(reader, None).context(DecodeArrowSnafu)?;
+            let arrow_schema = reader.schema();
             let mut chunks = Vec::with_capacity(self.mutation_types.len());

-            for stream_state in reader.by_ref() {
-                let stream_state = stream_state.context(DecodeArrowSnafu)?;
-                let chunk = match stream_state {
-                    StreamState::Some(chunk) => chunk,
-                    StreamState::Waiting => return StreamWaitingSnafu {}.fail(),
-                };
-                chunks.push(chunk);
+            for maybe_record_batch in reader.by_ref() {
+                let record_batch = maybe_record_batch.context(DecodeArrowSnafu)?;
+                chunks.push(record_batch);
             }

             // check if exactly finished
@@ -629,12 +620,15 @@ pub mod codec {
             let schema = Arc::new(Schema::try_from(arrow_schema).context(ParseSchemaSnafu)?);
             let mut write_batch = WriteBatch::new(schema.clone());

-            for (mutation_type, chunk) in self.mutation_types.iter().zip(chunks.into_iter()) {
+            for (mutation_type, record_batch) in self.mutation_types.iter().zip(chunks.into_iter())
+            {
                 match MutationType::from_i32(*mutation_type) {
                     Some(MutationType::Put) => {
                         let mut put_data = PutData::with_num_columns(schema.num_columns());
-                        for (column_schema, array) in
-                            schema.column_schemas().iter().zip(chunk.arrays().iter())
+                        for (column_schema, array) in schema
+                            .column_schemas()
+                            .iter()
+                            .zip(record_batch.columns().iter())
                         {
                             let vector =
                                 Helper::try_into_vector(array).context(DecodeVectorSnafu)?;
@@ -654,7 +648,6 @@ pub mod codec {
                 }
             }
         }
-
         Ok(write_batch)
     }
 }
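On the decode side, arrow-rs's StreamReader iterates Result<RecordBatch> items directly, which is why the StreamState::Waiting branch and the StreamWaiting error variant above could be deleted outright. A matching standalone sketch (decode is an illustrative name):

use std::io::Cursor;

use arrow::error::Result;
use arrow::ipc::reader::StreamReader;
use arrow::record_batch::RecordBatch;

// Each iterator item is already a Result<RecordBatch>; there is no partial
// "waiting" state to handle when reading from a complete in-memory buffer.
fn decode(bytes: &[u8]) -> Result<Vec<RecordBatch>> {
    let reader = StreamReader::try_new(Cursor::new(bytes), None)?; // None: read all columns
    reader.collect()
}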
@@ -787,7 +780,8 @@ mod tests {
     use datatypes::type_id::LogicalTypeId;
     use datatypes::vectors::{
-        BooleanVector, ConstantVector, Int32Vector, Int64Vector, UInt64Vector,
+        BooleanVector, ConstantVector, Int32Vector, Int64Vector, TimestampMillisecondVector,
+        UInt64Vector,
     };

     use super::*;
@@ -835,7 +829,7 @@ mod tests {
             &[
                 ("k1", LogicalTypeId::UInt64, false),
                 (consts::VERSION_COLUMN_NAME, LogicalTypeId::UInt64, false),
-                ("ts", LogicalTypeId::Timestamp, false),
+                ("ts", LogicalTypeId::TimestampMillisecond, false),
                 ("v1", LogicalTypeId::Boolean, true),
             ],
             Some(2),
@@ -846,7 +840,7 @@ mod tests {
     fn test_write_batch_put() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
         let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));

         let mut put_data = PutData::new();
         put_data.add_key_column("k1", intv.clone()).unwrap();
@@ -872,8 +866,8 @@ mod tests {

     #[test]
     fn test_write_batch_too_large() {
-        let boolv = Arc::new(BooleanVector::from_iter(
-            iter::repeat(Some(true)).take(MAX_BATCH_SIZE + 1),
+        let boolv = Arc::new(BooleanVector::from_iterator(
+            iter::repeat(true).take(MAX_BATCH_SIZE + 1),
         ));

         let mut put_data = PutData::new();
@@ -922,7 +916,7 @@ mod tests {

     #[test]
     fn test_put_type_has_null() {
-        let intv = Arc::new(UInt64Vector::from_iter(&[Some(1), None, Some(3)]));
+        let intv = Arc::new(UInt64Vector::from(vec![Some(1), None, Some(3)]));
         let tsv = Arc::new(Int64Vector::from_vec(vec![0, 0, 0]));

         let mut put_data = PutData::new();
@@ -950,7 +944,7 @@ mod tests {

     #[test]
     fn test_put_unknown_column() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));
         let boolv = Arc::new(BooleanVector::from(vec![true, false, true]));

         let mut put_data = PutData::new();
@@ -990,7 +984,9 @@ mod tests {

     #[test]
     pub fn test_write_batch_time_range() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4, 5, 6]));
-        let tsv = Arc::new(TimestampVector::from_vec(vec![-21, -20, -1, 0, 1, 20]));
+        let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![
+            -21, -20, -1, 0, 1, 20,
+        ]));
         let boolv = Arc::new(BooleanVector::from(vec![
             true, false, true, false, false, false,
         ]));
@@ -1018,7 +1014,7 @@ mod tests {
     pub fn test_write_batch_time_range_const_vector() {
         let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4, 5, 6]));
         let tsv = Arc::new(ConstantVector::new(
-            Arc::new(TimestampVector::from_vec(vec![20])),
+            Arc::new(TimestampMillisecondVector::from_vec(vec![20])),
             6,
         ));
         let boolv = Arc::new(BooleanVector::from(vec![
@@ -1049,7 +1045,7 @@ mod tests {
         for i in 0..10 {
             let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
             let boolv = Arc::new(BooleanVector::from(vec![Some(true), Some(false), None]));
-            let tsv = Arc::new(TimestampVector::from_vec(vec![i, i, i]));
+            let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![i, i, i]));

             let mut put_data = PutData::new();
             put_data.add_key_column("k1", intv.clone()).unwrap();
@@ -1103,7 +1099,7 @@ mod tests {
         let mut batch = new_test_batch();
         for _ in 0..10 {
             let intv = Arc::new(UInt64Vector::from_slice(&[1, 2, 3]));
-            let tsv = Arc::new(TimestampVector::from_vec(vec![0, 0, 0]));
+            let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0]));

             let mut put_data = PutData::new();
             put_data.add_key_column("k1", intv.clone()).unwrap();
diff --git a/src/storage/src/write_batch/compat.rs b/src/storage/src/write_batch/compat.rs
index dcd9d155c3..ce45ffc1db 100644
--- a/src/storage/src/write_batch/compat.rs
+++ b/src/storage/src/write_batch/compat.rs
@@ -99,7 +99,7 @@ mod tests {
     use datatypes::data_type::ConcreteDataType;
     use datatypes::schema::{ColumnDefaultConstraint, SchemaBuilder};
-    use datatypes::vectors::{Int32Vector, TimestampVector};
+    use datatypes::vectors::{Int32Vector, TimestampMillisecondVector};
     use store_api::storage::{PutOperation, WriteRequest};

     use super::*;
@@ -110,8 +110,12 @@ mod tests {
     ) -> SchemaBuilder {
         let mut column_schemas = vec![
             ColumnSchema::new("k0", ConcreteDataType::int32_datatype(), false),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
         ];

         if let Some(v0_constraint) = v0_constraint {
@@ -134,7 +138,7 @@ mod tests {
     fn new_put_data() -> PutData {
         let mut put_data = PutData::new();
         let k0 = Arc::new(Int32Vector::from_slice(&[1, 2, 3]));
-        let ts = Arc::new(TimestampVector::from_values([11, 12, 13]));
+        let ts = Arc::new(TimestampMillisecondVector::from_values([11, 12, 13]));

         put_data.add_key_column("k0", k0).unwrap();
         put_data.add_key_column("ts", ts).unwrap();
diff --git a/src/store-api/src/storage/chunk.rs b/src/store-api/src/storage/chunk.rs
index ca7a8e1736..32fedc2df1 100644
--- a/src/store-api/src/storage/chunk.rs
+++ b/src/store-api/src/storage/chunk.rs
@@ -19,6 +19,7 @@ use datatypes::vectors::VectorRef;
 use crate::storage::SchemaRef;

 /// A bunch of rows in columnar format.
+#[derive(Debug)]
 pub struct Chunk {
     pub columns: Vec<VectorRef>,
     // TODO(yingwen): Sequences.
diff --git a/src/store-api/src/storage/descriptors.rs b/src/store-api/src/storage/descriptors.rs
index 9942239286..b31159b1bd 100644
--- a/src/store-api/src/storage/descriptors.rs
+++ b/src/store-api/src/storage/descriptors.rs
@@ -27,7 +27,7 @@ pub type RegionId = u64;
 pub type RegionNumber = u32;

 /// A [ColumnDescriptor] contains information to create a column.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Builder)]
 #[builder(pattern = "owned", build_fn(validate = "Self::validate"))]
 pub struct ColumnDescriptor {
     pub id: ColumnId,
@@ -107,7 +107,7 @@ impl ColumnDescriptorBuilder {
 }

 /// A [RowKeyDescriptor] contains information about row key.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct RowKeyDescriptor {
     #[builder(default, setter(each(name = "push_column")))]
@@ -122,7 +122,7 @@ pub struct RowKeyDescriptor {
 }

 /// A [ColumnFamilyDescriptor] contains information to create a column family.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct ColumnFamilyDescriptor {
     #[builder(default = "consts::DEFAULT_CF_ID")]
@@ -135,7 +135,7 @@ pub struct ColumnFamilyDescriptor {
 }

 /// A [RegionDescriptor] contains information to create a region.
-#[derive(Debug, Clone, PartialEq, Builder)]
+#[derive(Debug, Clone, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct RegionDescriptor {
     pub id: RegionId,
diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml
index 8e7cebb40d..59d9a901bd 100644
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@@ -12,10 +12,9 @@ common-error = { path = "../common/error" }
 common-query = { path = "../common/query" }
 common-recordbatch = { path = "../common/recordbatch" }
 common-telemetry = { path = "../common/telemetry" }
-datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = [
-    "simd",
-] }
-datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion = "14.0.0"
+datafusion-common = "14.0.0"
+datafusion-expr = "14.0.0"
 datatypes = { path = "../datatypes" }
 derive_builder = "0.11"
 futures = "0.3"
@@ -27,6 +26,7 @@ store-api = { path = "../store-api" }
 tokio = { version = "1.18", features = ["full"] }

 [dev-dependencies]
-datafusion-expr = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
+datafusion-expr = "14.0.0"
 tempdir = "0.3"
 tokio-util = { version = "0.7", features = ["compat"] }
+parquet = { version = "26", features = ["async"] }
diff --git a/src/table/src/error.rs b/src/table/src/error.rs
index ed18c471ce..3605ab0a1a 100644
--- a/src/table/src/error.rs
+++ b/src/table/src/error.rs
@@ -152,7 +152,9 @@ impl From<Error> for DataFusionError {

 impl From<InnerError> for RecordBatchError {
     fn from(e: InnerError) -> RecordBatchError {
-        RecordBatchError::new(e)
+        RecordBatchError::External {
+            source: BoxedError::new(e),
+        }
     }
 }

@@ -173,7 +175,7 @@ mod tests {
     }

     fn throw_arrow() -> Result<()> {
-        Err(ArrowError::Overflow).context(PollStreamSnafu)?
+        Err(ArrowError::ComputeError("Overflow".to_string())).context(PollStreamSnafu)?
     }

     #[test]
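The From<InnerError> change follows a common shape: rather than a bespoke constructor per foreign error, foreign errors are boxed into a single External variant's source field. A self-contained sketch with stand-in types (InnerError and RecordBatchError here are illustrative definitions, not the repo's):

use std::fmt;

#[derive(Debug)]
struct InnerError(String);

impl fmt::Display for InnerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "inner error: {}", self.0)
    }
}

impl std::error::Error for InnerError {}

#[derive(Debug)]
enum RecordBatchError {
    // One generic variant that owns any boxed error as its `source`.
    External {
        source: Box<dyn std::error::Error + Send + Sync>,
    },
}

impl From<InnerError> for RecordBatchError {
    fn from(e: InnerError) -> Self {
        RecordBatchError::External {
            source: Box::new(e),
        }
    }
}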
diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs
index 2e0f722352..e481acbc7e 100644
--- a/src/table/src/metadata.rs
+++ b/src/table/src/metadata.rs
@@ -68,7 +68,7 @@ pub struct TableIdent {
     pub version: TableVersion,
 }

-#[derive(Clone, Debug, Builder, PartialEq)]
+#[derive(Clone, Debug, Builder, PartialEq, Eq)]
 #[builder(pattern = "mutable")]
 pub struct TableMeta {
     pub schema: SchemaRef,
@@ -322,7 +322,7 @@ impl TableMeta {
     }
 }

-#[derive(Clone, Debug, PartialEq, Builder)]
+#[derive(Clone, Debug, PartialEq, Eq, Builder)]
 #[builder(pattern = "owned")]
 pub struct TableInfo {
     /// Id and version of the table.
@@ -383,7 +383,7 @@ impl From<TableInfo> for TableIdent {
 }

 /// Struct used to serialize and deserialize [`TableMeta`].
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct RawTableMeta {
     pub schema: RawSchema,
     pub primary_key_indices: Vec<usize>,
@@ -431,7 +431,7 @@ impl TryFrom<RawTableMeta> for TableMeta {
 }

 /// Struct used to serialize and deserialize [`TableInfo`].
-#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct RawTableInfo {
     pub ident: TableIdent,
     pub name: String,
@@ -483,8 +483,12 @@ mod tests {
     fn new_test_schema() -> Schema {
         let column_schemas = vec![
             ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
             ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true),
         ];
         SchemaBuilder::try_from(column_schemas)
@@ -607,8 +611,12 @@ mod tests {
             ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
             ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true),
             ColumnSchema::new("col3", ConcreteDataType::int32_datatype(), true),
-            ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
-                .with_time_index(true),
+            ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
         ];
         let schema = Arc::new(
             SchemaBuilder::try_from(column_schemas)
diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs
index 64d32d57f4..6e61415cbe 100644
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -16,8 +16,8 @@ mod stats;

 use common_query::logical_plan::Expr;
 use common_telemetry::{error, warn};
+use datafusion::parquet::file::metadata::RowGroupMetaData;
 use datafusion::physical_optimizer::pruning::PruningPredicate;
-use datatypes::arrow::io::parquet::read::RowGroupMetaData;
 use datatypes::schema::SchemaRef;

 use crate::predicate::stats::RowGroupPruningStatistics;
@@ -70,19 +70,17 @@ mod tests {
     use std::sync::Arc;

-    pub use datafusion::parquet::schema::types::{BasicTypeInfo, PhysicalType};
-    use datafusion_common::Column;
-    use datafusion_expr::{Expr, Literal, Operator};
-    use datatypes::arrow::array::{Int32Array, Utf8Array};
-    use datatypes::arrow::chunk::Chunk;
+    use datafusion::parquet::arrow::ArrowWriter;
+    pub use datafusion::parquet::schema::types::BasicTypeInfo;
+    use datafusion_common::{Column, ScalarValue};
+    use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
+    use datatypes::arrow::array::Int32Array;
     use datatypes::arrow::datatypes::{DataType, Field, Schema};
-    use datatypes::arrow::io::parquet::read::FileReader;
-    use datatypes::arrow::io::parquet::write::{
-        Compression, Encoding, FileSink, Version, WriteOptions,
-    };
-    use futures::{AsyncWriteExt, SinkExt};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use datatypes::arrow_array::StringArray;
+    use parquet::arrow::ParquetRecordBatchStreamBuilder;
+    use parquet::file::properties::WriterProperties;
     use tempdir::TempDir;
-    use tokio_util::compat::TokioAsyncWriteCompatExt;

     use super::*;

@@ -95,80 +93,62 @@ mod tests {
         let name_field = Field::new("name", DataType::Utf8, true);
         let count_field = Field::new("cnt", DataType::Int32, true);
+        let schema = Arc::new(Schema::new(vec![name_field, count_field]));
-        let schema = Schema::from(vec![name_field, count_field]);
-
-        // now all physical types use plain encoding, maybe let caller to choose encoding for each type.
-        let encodings = vec![Encoding::Plain].repeat(schema.fields.len());
-
-        let mut writer = tokio::fs::OpenOptions::new()
+        let file = std::fs::OpenOptions::new()
             .write(true)
             .create(true)
-            .open(&path)
-            .await
-            .unwrap()
-            .compat_write();
+            .open(path.clone())
+            .unwrap();

-        let mut sink = FileSink::try_new(
-            &mut writer,
-            schema.clone(),
-            encodings,
-            WriteOptions {
-                write_statistics: true,
-                compression: Compression::Gzip,
-                version: Version::V2,
-            },
-        )
-        .unwrap();
+        let write_props = WriterProperties::builder()
+            .set_max_row_group_size(10)
+            .build();
+        let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(write_props)).unwrap();

         for i in (0..cnt).step_by(10) {
-            let name_array = Utf8Array::<i32>::from(
-                &(i..(i + 10).min(cnt))
-                    .map(|i| Some(i.to_string()))
-                    .collect::<Vec<_>>(),
-            );
-            let count_array = Int32Array::from(
-                &(i..(i + 10).min(cnt))
-                    .map(|i| Some(i as i32))
-                    .collect::<Vec<_>>(),
-            );
-
-            sink.send(Chunk::new(vec![
-                Arc::new(name_array),
-                Arc::new(count_array),
-            ]))
-            .await
-            .unwrap();
+            let name_array = Arc::new(StringArray::from(
+                (i..(i + 10).min(cnt))
+                    .map(|i| i.to_string())
+                    .collect::<Vec<_>>(),
+            )) as Arc<_>;
+            let count_array = Arc::new(Int32Array::from(
+                (i..(i + 10).min(cnt)).map(|i| i as i32).collect::<Vec<_>>(),
+            )) as Arc<_>;
+            let rb = RecordBatch::try_new(schema.clone(), vec![name_array, count_array]).unwrap();
+            writer.write(&rb).unwrap();
         }
-        sink.close().await.unwrap();
-
-        drop(sink);
-        writer.flush().await.unwrap();
-
-        (path, Arc::new(schema))
+        writer.close().unwrap();
+        (path, schema)
     }
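gen_test_parquet_file above caps row groups at 10 rows so the pruning assertions have several row groups to keep or drop. The same write path in isolation, assuming the arrow and parquet (v26) crates (write_row_groups and the column name are illustrative):

use std::fs::File;
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

fn write_row_groups(path: &str) -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("cnt", DataType::Int32, true)]));
    let props = WriterProperties::builder()
        .set_max_row_group_size(10) // close a row group every 10 rows
        .build();
    let mut writer = ArrowWriter::try_new(File::create(path)?, schema.clone(), Some(props))?;
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Int32Array::from((0..25).collect::<Vec<i32>>()))],
    )?;
    writer.write(&batch)?; // 25 rows -> row groups of 10, 10 and 5
    writer.close()?; // close() writes the footer with per-group statistics
    Ok(())
}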

     async fn assert_prune(array_cnt: usize, predicate: Predicate, expect: Vec<bool>) {
         let dir = TempDir::new("prune_parquet").unwrap();
         let (path, schema) = gen_test_parquet_file(&dir, array_cnt).await;
-        let file_reader =
-            FileReader::try_new(std::fs::File::open(path).unwrap(), None, None, None, None)
-                .unwrap();
         let schema = Arc::new(datatypes::schema::Schema::try_from(schema).unwrap());
-
-        let vec = file_reader.metadata().row_groups.clone();
-        let res = predicate.prune_row_groups(schema, &vec);
+        let builder = ParquetRecordBatchStreamBuilder::new(
+            tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(path)
+                .await
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+        let metadata = builder.metadata().clone();
+        let row_groups = metadata.row_groups();
+        let res = predicate.prune_row_groups(schema, row_groups);
         assert_eq!(expect, res);
     }

     fn gen_predicate(max_val: i32, op: Operator) -> Predicate {
-        Predicate::new(vec![Expr::BinaryExpr {
-            left: Box::new(Expr::Column(Column::from_name("cnt".to_string()))),
-            op,
-            right: Box::new(max_val.lit()),
-        }
-        .into()])
+        Predicate::new(vec![common_query::logical_plan::Expr::from(
+            Expr::BinaryExpr(BinaryExpr {
+                left: Box::new(Expr::Column(Column::from_name("cnt"))),
+                op,
+                right: Box::new(Expr::Literal(ScalarValue::Int32(Some(max_val)))),
+            }),
+        )])
     }

     #[tokio::test]
diff --git a/src/table/src/predicate/stats.rs b/src/table/src/predicate/stats.rs
index b474eddeb1..f092cd5418 100644
--- a/src/table/src/predicate/stats.rs
+++ b/src/table/src/predicate/stats.rs
@@ -12,17 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use datafusion::parquet::metadata::RowGroupMetaData;
-use datafusion::parquet::statistics::{
-    BinaryStatistics, BooleanStatistics, FixedLenStatistics, PrimitiveStatistics,
-};
+use std::sync::Arc;
+
+use datafusion::parquet::file::metadata::RowGroupMetaData;
+use datafusion::parquet::file::statistics::Statistics as ParquetStats;
 use datafusion::physical_optimizer::pruning::PruningStatistics;
 use datafusion_common::{Column, ScalarValue};
-use datatypes::arrow::array::ArrayRef;
+use datatypes::arrow::array::{ArrayRef, UInt64Array};
 use datatypes::arrow::datatypes::DataType;
-use datatypes::arrow::io::parquet::read::PhysicalType;
-use datatypes::prelude::Vector;
-use datatypes::vectors::Int64Vector;
 use paste::paste;

 pub struct RowGroupPruningStatistics<'a> {
@@ -40,92 +37,58 @@ impl<'a> RowGroupPruningStatistics<'a> {

     fn field_by_name(&self, name: &str) -> Option<(usize, &DataType)> {
         let idx = self.schema.column_index_by_name(name)?;
-        let data_type = &self.schema.arrow_schema().fields.get(idx)?.data_type;
+        let data_type = &self.schema.arrow_schema().fields.get(idx)?.data_type();
         Some((idx, data_type))
     }
 }

 macro_rules! impl_min_max_values {
-    ($self:ident, $col:ident, $min_max: ident) => {
-        paste! {
-            {
-                let (column_index, data_type) = $self.field_by_name(&$col.name)?;
-                let null_scalar: ScalarValue = data_type.try_into().ok()?;
-                let scalar_values: Vec<ScalarValue> = $self
-                    .meta_data
-                    .iter()
-                    .flat_map(|meta| meta.column(column_index).statistics())
-                    .map(|stats| {
-                        let stats = stats.ok()?;
-                        let res = match stats.physical_type() {
-                            PhysicalType::Boolean => {
-                                let $min_max = stats.as_any().downcast_ref::<BooleanStatistics>().unwrap().[<$min_max _value>];
-                                Some(ScalarValue::Boolean($min_max))
-                            }
-                            PhysicalType::Int32 => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<i32>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Int32($min_max))
-                            }
-                            PhysicalType::Int64 => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<i64>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Int64($min_max))
-                            }
-                            PhysicalType::Int96 => {
-                                // INT96 currently not supported
-                                None
-                            }
-                            PhysicalType::Float => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<f32>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Float32($min_max))
-                            }
-                            PhysicalType::Double => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<PrimitiveStatistics<f64>>()
-                                    .unwrap()
-                                    .[<$min_max _value>];
-                                Some(ScalarValue::Float64($min_max))
-                            }
-                            PhysicalType::ByteArray => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<BinaryStatistics>()
-                                    .unwrap()
-                                    .[<$min_max _value>]
-                                    .clone();
-                                Some(ScalarValue::Binary($min_max))
-                            }
-                            PhysicalType::FixedLenByteArray(_) => {
-                                let $min_max = stats
-                                    .as_any()
-                                    .downcast_ref::<FixedLenStatistics>()
-                                    .unwrap()
-                                    .[<$min_max _value>]
-                                    .clone();
-                                Some(ScalarValue::Binary($min_max))
-                            }
-                        };
-
-                        res
-                    })
-                    .map(|maybe_scalar| maybe_scalar.unwrap_or_else(|| null_scalar.clone()))
-                    .collect::<Vec<_>>();
-                ScalarValue::iter_to_array(scalar_values).ok()
-            }
-        }
-    };
+    ($self:ident, $col:ident, $min_max: ident) => {{
+        let arrow_schema = $self.schema.arrow_schema().clone();
+        let (column_index, field) = if let Some((v, f)) = arrow_schema.column_with_name(&$col.name)
+        {
+            (v, f)
+        } else {
+            return None;
+        };
+        let data_type = field.data_type();
+        let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() {
+            v
+        } else {
+            return None;
+        };

+        let scalar_values = $self
+            .meta_data
+            .iter()
+            .map(|meta| {
+                let stats = meta.column(column_index).statistics()?;
+                if !stats.has_min_max_set() {
+                    return None;
+                }
+                match stats {
+                    ParquetStats::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$min_max()))),
+                    ParquetStats::Int32(s) => Some(ScalarValue::Int32(Some(*s.$min_max()))),
+                    ParquetStats::Int64(s) => Some(ScalarValue::Int64(Some(*s.$min_max()))),
+
+                    ParquetStats::Int96(_) => None,
+                    ParquetStats::Float(s) => Some(ScalarValue::Float32(Some(*s.$min_max()))),
+                    ParquetStats::Double(s) => Some(ScalarValue::Float64(Some(*s.$min_max()))),
+                    ParquetStats::ByteArray(s) => {
+                        paste! {
+                            let s = String::from_utf8(s.[<$min_max _bytes>]().to_owned()).ok();
+                        }
+                        Some(ScalarValue::Utf8(s))
+                    }
+
+                    ParquetStats::FixedLenByteArray(_) => None,
+                }
+            })
+            .map(|maybe_scalar| maybe_scalar.unwrap_or_else(|| null_scalar.clone()))
+            .collect::<Vec<_>>();
+        debug_assert_eq!(scalar_values.len(), $self.meta_data.len());
+        ScalarValue::iter_to_array(scalar_values).ok()
+    }};
 }

 impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
@@ -143,14 +106,13 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {

     fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
         let (idx, _) = self.field_by_name(&column.name)?;
-        let mut values: Vec<Option<i64>> = Vec::with_capacity(self.meta_data.len());
+        let mut values: Vec<Option<u64>> = Vec::with_capacity(self.meta_data.len());
         for m in self.meta_data {
             let col = m.column(idx);
-            let stat = col.statistics()?.ok()?;
+            let stat = col.statistics()?;
             let bs = stat.null_count();
-            values.push(bs);
+            values.push(Some(bs));
         }
-
-        Some(Int64Vector::from(values).to_arrow_array())
+        Some(Arc::new(UInt64Array::from(values)))
     }
 }
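The rewritten macro reduces to one idea: for every row group, emit a ScalarValue from the parquet statistics, or the type's null scalar when statistics are absent, then pack the per-group values into a single array that DataFusion's PruningPredicate can evaluate. An Int64-only sketch of that reduction, against datafusion 14's re-exported parquet types (min_values_i64 is an illustrative name):

use datafusion::arrow::array::ArrayRef;
use datafusion::parquet::file::statistics::Statistics;
use datafusion_common::ScalarValue;

// One entry per row group: a known minimum, or NULL when stats are missing.
fn min_values_i64(per_row_group: &[Option<&Statistics>]) -> Option<ArrayRef> {
    let scalars = per_row_group.iter().map(|stats| match stats {
        Some(Statistics::Int64(s)) if s.has_min_max_set() => ScalarValue::Int64(Some(*s.min())),
        _ => ScalarValue::Int64(None), // unknown must stay NULL, never a guessed bound
    });
    ScalarValue::iter_to_array(scalars).ok()
}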
diff --git a/src/table/src/table/adapter.rs b/src/table/src/table/adapter.rs
index 32824e7a49..98ff82d08a 100644
--- a/src/table/src/table/adapter.rs
+++ b/src/table/src/table/adapter.rs
@@ -23,7 +23,9 @@ use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
 use datafusion::datasource::datasource::TableProviderFilterPushDown as DfTableProviderFilterPushDown;
 use datafusion::datasource::{TableProvider, TableType as DfTableType};
 use datafusion::error::Result as DfResult;
-use datafusion::logical_plan::Expr as DfExpr;
+use datafusion::execution::context::SessionState;
+use datafusion::prelude::SessionContext;
+use datafusion_expr::expr::Expr as DfExpr;
 use datatypes::schema::{SchemaRef as TableSchemaRef, SchemaRef};
 use snafu::prelude::*;

@@ -66,6 +68,7 @@ impl TableProvider for DfTableProviderAdapter {

     async fn scan(
         &self,
+        _ctx: &SessionState,
         projection: &Option<Vec<usize>>,
         filters: &[DfExpr],
         limit: Option<usize>,
@@ -135,11 +138,12 @@ impl Table for TableAdapter {
         filters: &[Expr],
         limit: Option<usize>,
     ) -> Result<PhysicalPlanRef> {
+        let ctx = SessionContext::new();
         let filters: Vec<DfExpr> = filters.iter().map(|e| e.df_expr().clone()).collect();
         debug!("TableScan filter size: {}", filters.len());
         let execution_plan = self
             .table_provider
-            .scan(projection, &filters, limit)
+            .scan(&ctx.state(), projection, &filters, limit)
             .await
             .context(error::DatafusionSnafu)?;
         let schema: SchemaRef = Arc::new(
@@ -168,7 +172,6 @@ impl Table for TableAdapter {
 mod tests {
     use datafusion::arrow;
     use datafusion::datasource::empty::EmptyTable;
-    use datafusion_common::field_util::SchemaExt;

     use super::*;
     use crate::metadata::TableType::Base;
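DataFusion 14 threads a SessionState through TableProvider::scan, and callers outside an executing query can mint one from a fresh SessionContext, as TableAdapter does above. The call shape against DataFusion's built-in EmptyTable provider (scan_shape is an illustrative name):

use std::sync::Arc;

use datafusion::arrow::datatypes::Schema;
use datafusion::datasource::empty::EmptyTable;
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

async fn scan_shape() -> Result<()> {
    let ctx = SessionContext::new();
    let provider = EmptyTable::new(Arc::new(Schema::empty()));
    // DataFusion 14 signature: scan(state, projection, filters, limit)
    let _plan = provider.scan(&ctx.state(), &None, &[], None).await?;
    Ok(())
}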
diff --git a/src/table/src/table/numbers.rs b/src/table/src/table/numbers.rs
index db33769c31..7664d8f0fd 100644
--- a/src/table/src/table/numbers.rs
+++ b/src/table/src/table/numbers.rs
@@ -19,7 +19,8 @@ use std::sync::Arc;
 use common_query::physical_plan::PhysicalPlanRef;
 use common_recordbatch::error::Result as RecordBatchResult;
 use common_recordbatch::{RecordBatch, RecordBatchStream};
-use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
+use datafusion::arrow::record_batch::RecordBatch as DfRecordBatch;
+use datafusion_common::from_slice::FromSlice;
 use datatypes::arrow::array::UInt32Array;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, SchemaBuilder, SchemaRef};
@@ -139,9 +140,9 @@ impl Stream for NumbersStream {
         )
         .unwrap();

-        Poll::Ready(Some(Ok(RecordBatch {
-            schema: self.schema.clone(),
-            df_recordbatch: batch,
-        })))
+        Poll::Ready(Some(RecordBatch::try_from_df_record_batch(
+            self.schema.clone(),
+            batch,
+        )))
     }
 }
diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs
index 4e1ef884e7..b9078befa8 100644
--- a/src/table/src/table/scan.rs
+++ b/src/table/src/table/scan.rs
@@ -18,8 +18,9 @@ use std::sync::{Arc, Mutex};

 use common_query::error as query_error;
 use common_query::error::Result as QueryResult;
-use common_query::physical_plan::{Partitioning, PhysicalPlan, PhysicalPlanRef, RuntimeEnv};
+use common_query::physical_plan::{Partitioning, PhysicalPlan, PhysicalPlanRef};
 use common_recordbatch::SendableRecordBatchStream;
+use datafusion::execution::context::TaskContext;
 use datatypes::schema::SchemaRef;
 use snafu::OptionExt;

@@ -71,16 +72,17 @@ impl PhysicalPlan for SimpleTableScan {
     fn execute(
         &self,
         _partition: usize,
-        _runtime: Arc<RuntimeEnv>,
+        _context: Arc<TaskContext>,
     ) -> QueryResult<SendableRecordBatchStream> {
         let mut stream = self.stream.lock().unwrap();
-        Ok(stream.take().context(query_error::ExecuteRepeatedlySnafu)?)
+        stream.take().context(query_error::ExecuteRepeatedlySnafu)
     }
 }

 #[cfg(test)]
 mod test {
     use common_recordbatch::{util, RecordBatch, RecordBatches};
+    use datafusion::prelude::SessionContext;
     use datatypes::data_type::ConcreteDataType;
     use datatypes::schema::{ColumnSchema, Schema};
     use datatypes::vectors::Int32Vector;
@@ -89,6 +91,7 @@ mod test {

     #[tokio::test]
     async fn test_simple_table_scan() {
+        let ctx = SessionContext::new();
         let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
             "a",
             ConcreteDataType::int32_datatype(),
@@ -114,13 +117,12 @@ mod test {

         assert_eq!(scan.schema(), schema);

-        let runtime = Arc::new(RuntimeEnv::default());
-        let stream = scan.execute(0, runtime.clone()).unwrap();
+        let stream = scan.execute(0, ctx.task_ctx()).unwrap();
         let recordbatches = util::collect(stream).await.unwrap();
         assert_eq!(recordbatches[0], batch1);
         assert_eq!(recordbatches[1], batch2);

-        let result = scan.execute(0, runtime);
+        let result = scan.execute(0, ctx.task_ctx());
         assert!(result.is_err());
         match result {
             Err(e) => assert!(e
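ExecutionPlan::execute likewise swapped Arc<RuntimeEnv> for Arc<TaskContext>; tests obtain one with SessionContext::task_ctx, and a second execute on SimpleTableScan still fails because the stream is taken out of the Mutex on first use. A minimal sketch of obtaining the context (task_context_shape is an illustrative name):

use datafusion::prelude::SessionContext;

fn task_context_shape() {
    let ctx = SessionContext::new();
    // Arc<TaskContext> is what execute() receives now, one per task.
    let task_ctx = ctx.task_ctx();
    drop(task_ctx);
}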
diff --git a/src/table/src/test_util/memtable.rs b/src/table/src/test_util/memtable.rs
index 5f35e73c82..c0cd028f45 100644
--- a/src/table/src/test_util/memtable.rs
+++ b/src/table/src/test_util/memtable.rs
@@ -17,6 +17,7 @@ use std::pin::Pin;
 use std::sync::Arc;

 use async_trait::async_trait;
+use common_error::prelude::BoxedError;
 use common_query::physical_plan::PhysicalPlanRef;
 use common_query::prelude::Expr;
 use common_recordbatch::error::Result as RecordBatchResult;
@@ -29,7 +30,7 @@ use futures::Stream;
 use snafu::prelude::*;
 use store_api::storage::RegionNumber;

-use crate::error::{Result, SchemaConversionSnafu, TableProjectionSnafu};
+use crate::error::{Result, SchemaConversionSnafu, TableProjectionSnafu, TablesRecordBatchSnafu};
 use crate::metadata::{
     TableId, TableInfoBuilder, TableInfoRef, TableMetaBuilder, TableType, TableVersion,
 };
@@ -145,11 +146,11 @@ impl Table for MemTable {
     ) -> Result<PhysicalPlanRef> {
         let df_recordbatch = if let Some(indices) = projection {
             self.recordbatch
-                .df_recordbatch
+                .df_record_batch()
                 .project(indices)
                 .context(TableProjectionSnafu)?
         } else {
-            self.recordbatch.df_recordbatch.clone()
+            self.recordbatch.df_record_batch().clone()
         };

         let rows = df_recordbatch.num_rows();
@@ -160,12 +161,12 @@ impl Table for MemTable {
         };
         let df_recordbatch = df_recordbatch.slice(0, limit);

-        let recordbatch = RecordBatch {
-            schema: Arc::new(
-                Schema::try_from(df_recordbatch.schema().clone()).context(SchemaConversionSnafu)?,
-            ),
+        let recordbatch = RecordBatch::try_from_df_record_batch(
+            Arc::new(Schema::try_from(df_recordbatch.schema()).context(SchemaConversionSnafu)?),
             df_recordbatch,
-        };
+        )
+        .map_err(BoxedError::new)
+        .context(TablesRecordBatchSnafu)?;
         Ok(Arc::new(SimpleTableScan::new(Box::pin(MemtableStream {
             schema: recordbatch.schema.clone(),
             recordbatch: Some(recordbatch),
@@ -197,28 +198,27 @@ impl Stream for MemtableStream {

 #[cfg(test)]
 mod test {
-    use common_query::physical_plan::RuntimeEnv;
     use common_recordbatch::util;
+    use datafusion::prelude::SessionContext;
     use datatypes::prelude::*;
     use datatypes::schema::ColumnSchema;
-    use datatypes::vectors::{Int32Vector, StringVector};
+    use datatypes::vectors::{Helper, Int32Vector, StringVector};

     use super::*;

     #[tokio::test]
     async fn test_scan_with_projection() {
+        let ctx = SessionContext::new();
         let table = build_testing_table();

         let scan_stream = table.scan(&Some(vec![1]), &[], None).await.unwrap();
-        let scan_stream = scan_stream
-            .execute(0, Arc::new(RuntimeEnv::default()))
-            .unwrap();
+        let scan_stream = scan_stream.execute(0, ctx.task_ctx()).unwrap();
         let recordbatch = util::collect(scan_stream).await.unwrap();
         assert_eq!(1, recordbatch.len());
-        let columns = recordbatch[0].df_recordbatch.columns();
+        let columns = recordbatch[0].df_record_batch().columns();
         assert_eq!(1, columns.len());

-        let string_column = VectorHelper::try_into_vector(&columns[0]).unwrap();
+        let string_column = Helper::try_into_vector(&columns[0]).unwrap();
         let string_column = string_column
             .as_any()
             .downcast_ref::<StringVector>()
             .unwrap();
@@ -229,23 +229,22 @@ mod test {

     #[tokio::test]
     async fn test_scan_with_limit() {
+        let ctx = SessionContext::new();
         let table = build_testing_table();

         let scan_stream = table.scan(&None, &[], Some(2)).await.unwrap();
-        let scan_stream = scan_stream
-            .execute(0, Arc::new(RuntimeEnv::default()))
-            .unwrap();
+        let scan_stream = scan_stream.execute(0, ctx.task_ctx()).unwrap();
         let recordbatch = util::collect(scan_stream).await.unwrap();
         assert_eq!(1, recordbatch.len());
-        let columns = recordbatch[0].df_recordbatch.columns();
+        let columns = recordbatch[0].df_record_batch().columns();
         assert_eq!(2, columns.len());

-        let i32_column = VectorHelper::try_into_vector(&columns[0]).unwrap();
+        let i32_column = Helper::try_into_vector(&columns[0]).unwrap();
         let i32_column = i32_column.as_any().downcast_ref::<Int32Vector>().unwrap();
         let i32_column = i32_column.iter_data().flatten().collect::<Vec<_>>();
         assert_eq!(vec![-100], i32_column);

-        let string_column = VectorHelper::try_into_vector(&columns[1]).unwrap();
+        let string_column = Helper::try_into_vector(&columns[1]).unwrap();
         let string_column = string_column
             .as_any()
             .downcast_ref::<StringVector>()
diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs
index 958bcf2fb8..6c77b28d52 100644
--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -225,7 +225,7 @@ pub async fn setup_test_app(store_type: StorageType, name: &str) -> (Router, Tes
     create_test_table(
         instance.catalog_manager(),
         instance.sql_handler(),
-        ConcreteDataType::timestamp_millis_datatype(),
+        ConcreteDataType::timestamp_millisecond_datatype(),
     )
     .await
     .unwrap();
@@ -244,7 +244,7 @@ pub async fn setup_test_app_with_frontend(
     create_test_table(
         frontend.catalog_manager(),
         instance.sql_handler(),
-        ConcreteDataType::timestamp_millis_datatype(),
+        ConcreteDataType::timestamp_millisecond_datatype(),
     )
     .await
     .unwrap();
diff --git a/tests-integration/tests/grpc.rs b/tests-integration/tests/grpc.rs
index 6f94aff3e5..7ebce04509 100644
--- a/tests-integration/tests/grpc.rs
+++ b/tests-integration/tests/grpc.rs
@@ -109,11 +109,11 @@ fn expect_data() -> (Column, Column, Column, Column) {
     let expected_ts_col = Column {
         column_name: "ts".to_string(),
         values: Some(column::Values {
-            ts_millis_values: vec![100, 101, 102, 103],
+            ts_millisecond_values: vec![100, 101, 102, 103],
             ..Default::default()
         }),
         semantic_type: SemanticType::Timestamp as i32,
-        datatype: ColumnDataType::Timestamp as i32,
+        datatype: ColumnDataType::TimestampMillisecond as i32,
         ..Default::default()
     };

@@ -244,7 +244,7 @@ fn testing_create_expr() -> CreateExpr {
         },
         ColumnDef {
             name: "ts".to_string(),
-            datatype: 15, // timestamp
+            datatype: ColumnDataType::TimestampMillisecond as i32, // timestamp
             is_nullable: true,
             default_constraint: None,
         },
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 8d074bba67..267c49e824 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -116,7 +116,7 @@ pub async fn test_sql_api(store_type: StorageType) {
     assert_eq!(
         output[0],
         serde_json::from_value::(json!({
-            "records":{"schema":{"column_schemas":[{"name":"host","data_type":"String"},{"name":"cpu","data_type":"Float64"},{"name":"memory","data_type":"Float64"},{"name":"ts","data_type":"Timestamp"}]},"rows":[["host",66.6,1024.0,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"host","data_type":"String"},{"name":"cpu","data_type":"Float64"},{"name":"memory","data_type":"Float64"},{"name":"ts","data_type":"TimestampMillisecond"}]},"rows":[["host",66.6,1024.0,0]]}
         })).unwrap()
     );

@@ -138,7 +138,7 @@ pub async fn test_sql_api(store_type: StorageType) {
     assert_eq!(
         output[0],
         serde_json::from_value::(json!({
-            "records":{"schema":{"column_schemas":[{"name":"cpu","data_type":"Float64"},{"name":"ts","data_type":"Timestamp"}]},"rows":[[66.6,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"cpu","data_type":"Float64"},{"name":"ts","data_type":"TimestampMillisecond"}]},"rows":[[66.6,0]]}
         })).unwrap()
     );

@@ -159,7 +159,7 @@ pub async fn test_sql_api(store_type: StorageType) {
     assert_eq!(
         output[0],
         serde_json::from_value::(json!({
-            "records":{"schema":{"column_schemas":[{"name":"c","data_type":"Float64"},{"name":"time","data_type":"Timestamp"}]},"rows":[[66.6,0]]}
+            "records":{"schema":{"column_schemas":[{"name":"c","data_type":"Float64"},{"name":"time","data_type":"TimestampMillisecond"}]},"rows":[[66.6,0]]}
         })).unwrap()
     );
diff --git a/tests/runner/src/util.rs b/tests/runner/src/util.rs
index a6accc9ed7..6c42d4391d 100644
--- a/tests/runner/src/util.rs
+++ b/tests/runner/src/util.rs
@@ -98,8 +98,23 @@ pub fn values_to_string(data_type: ColumnDataType, values: Values) -> Vec<String> {
-        ColumnDataType::Timestamp => values
-            .ts_millis_values
+        ColumnDataType::TimestampSecond => values
+            .ts_second_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampMillisecond => values
+            .ts_millisecond_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampMicrosecond => values
+            .ts_microsecond_values
+            .into_iter()
+            .map(|v| v.to_string())
+            .collect(),
+        ColumnDataType::TimestampNanosecond => values
+            .ts_nanosecond_values
             .into_iter()
             .map(|v| v.to_string())
             .collect(),
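With one repeated field per timestamp precision in the wire format, the runner now matches per variant instead of assuming milliseconds. The same dispatch in miniature (TsValues and to_strings are illustrative, not the greptime proto types):

enum TsValues {
    Second(Vec<i64>),
    Millisecond(Vec<i64>),
    Microsecond(Vec<i64>),
    Nanosecond(Vec<i64>),
}

fn to_strings(values: TsValues) -> Vec<String> {
    // Every precision stringifies the same way; only the source field differs.
    match values {
        TsValues::Second(v)
        | TsValues::Millisecond(v)
        | TsValues::Microsecond(v)
        | TsValues::Nanosecond(v) => v.into_iter().map(|x| x.to_string()).collect(),
    }
}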