Compare commits

...

6 Commits

Author SHA1 Message Date
discord9
1afcddd5a9 chore: feature gate vector_index (#7428)
Signed-off-by: discord9 <discord9@163.com>
2025-12-17 07:14:25 +00:00
shuiyisong
62808b887b fix: using anonymous s3 access when ak and sk is not provided (#7425)
* chore: allow s3 anon

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: disable ec2 metadata

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-12-17 06:34:29 +00:00
discord9
04ddd40e00 chore: bump version to beta.3 (#7423)
chore: bump to beta.3

Signed-off-by: discord9 <discord9@163.com>
2025-12-17 04:18:23 +00:00
liyang
b4f028be5f chore: change etcd endpoints to array in the test scripts (#7419)
chore: change etcd endpoint

Signed-off-by: liyang <daviderli614@gmail.com>
2025-12-17 03:14:35 +00:00
Lei, HUANG
da964880f5 chore: expose symbols (#7417)
* refactor/expose-symbols:
 ## Refactor `bulk/part.rs` to Simplify Mutation Handling

 - Removed the `mutations_to_record_batch` function and its associated helper functions, including `ArraysSorter`, `timestamp_array_to_iter`, and `binary_array_to_dictionary`, to simplify the mutation handling logic in `bulk/part.rs`.
 - Deleted related test functions `check_binary_array_to_dictionary` and `check_mutations_to_record_batches` from the test module, along with their associated test cases.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/expose-symbols:
 ### Commit Message

 **Refactor and Enhance Deduplication Logic**

 - **`flush.rs`**: Refactored `maybe_dedup_one` function to accept `append_mode` and `merge_mode` as parameters instead of `RegionOptions`. This change enhances flexibility in deduplication logic.
 - **`memtable/bulk.rs`**: Made `BulkRangeIterBuilder` struct and its fields public to allow external access and modification, improving extensibility.
 - **`sst.rs`**: Corrected a typo in the schema documentation, changing `__prmary_key` to `__primary_key` for clarity and accuracy.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-17 01:29:36 +00:00
dennis zhuang
a35a39f726 feat(vector_index): adds the foundational types and SQL parsing support for vector index (#7366)
* feat: adds the foundational types and SQL parsing support for vector index

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* refactor: by suggestions

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: ensure index option values must be greater than zero

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: validate connectivity strictly

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: compile error

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: disable SIMD for ci

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-12-16 22:45:36 +00:00
23 changed files with 1228 additions and 657 deletions

View File

@@ -51,7 +51,7 @@ runs:
run: |
helm upgrade \
--install my-greptimedb \
--set meta.backendStorage.etcd.endpoints=${{ inputs.etcd-endpoints }} \
--set 'meta.backendStorage.etcd.endpoints[0]=${{ inputs.etcd-endpoints }}' \
--set meta.enableRegionFailover=${{ inputs.enable-region-failover }} \
--set image.registry=${{ inputs.image-registry }} \
--set image.repository=${{ inputs.image-repository }} \

View File

@@ -81,7 +81,7 @@ function deploy_greptimedb_cluster() {
--create-namespace \
--set image.tag="$GREPTIMEDB_IMAGE_TAG" \
--set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
--set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
--set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
--set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
-n "$install_namespace"
@@ -119,7 +119,7 @@ function deploy_greptimedb_cluster_with_s3_storage() {
--create-namespace \
--set image.tag="$GREPTIMEDB_IMAGE_TAG" \
--set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \
--set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \
--set "meta.backendStorage.etcd.endpoints[0]=etcd.$install_namespace.svc.cluster.local:2379" \
--set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \
--set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \
--set objectStorage.s3.region="$AWS_REGION" \

263
Cargo.lock generated
View File

@@ -212,7 +212,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arrow-schema",
"common-base",
@@ -733,7 +733,7 @@ dependencies = [
[[package]]
name = "auth"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -1383,7 +1383,7 @@ dependencies = [
[[package]]
name = "cache"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"catalog",
"common-error",
@@ -1418,7 +1418,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow",
@@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
[[package]]
name = "cli"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-stream",
"async-trait",
@@ -1816,7 +1816,7 @@ dependencies = [
[[package]]
name = "client"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arc-swap",
@@ -1849,7 +1849,7 @@ dependencies = [
"snafu 0.8.6",
"store-api",
"substrait 0.37.3",
"substrait 1.0.0-beta.2",
"substrait 1.0.0-beta.3",
"tokio",
"tokio-stream",
"tonic 0.13.1",
@@ -1889,7 +1889,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"auth",
@@ -1977,6 +1977,17 @@ dependencies = [
"unicode-width 0.2.1",
]
[[package]]
name = "codespan-reporting"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
dependencies = [
"serde",
"termcolor",
"unicode-width 0.2.1",
]
[[package]]
name = "colorchoice"
version = "1.0.4"
@@ -2012,7 +2023,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"anymap2",
"async-trait",
@@ -2036,14 +2047,14 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"const_format",
]
[[package]]
name = "common-config"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-base",
"common-error",
@@ -2068,7 +2079,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arrow",
"arrow-schema",
@@ -2103,7 +2114,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"bigdecimal 0.4.8",
"common-error",
@@ -2116,7 +2127,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-macro",
"http 1.3.1",
@@ -2127,7 +2138,7 @@ dependencies = [
[[package]]
name = "common-event-recorder"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -2149,7 +2160,7 @@ dependencies = [
[[package]]
name = "common-frontend"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -2171,7 +2182,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -2231,7 +2242,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"common-runtime",
@@ -2248,7 +2259,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow-flight",
@@ -2283,7 +2294,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"common-base",
@@ -2303,7 +2314,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"greptime-proto",
"once_cell",
@@ -2314,7 +2325,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"anyhow",
"common-error",
@@ -2330,7 +2341,7 @@ dependencies = [
[[package]]
name = "common-memory-manager"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-error",
"common-macro",
@@ -2343,7 +2354,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"anymap2",
"api",
@@ -2415,7 +2426,7 @@ dependencies = [
[[package]]
name = "common-options"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-grpc",
"humantime-serde",
@@ -2424,11 +2435,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
[[package]]
name = "common-pprof"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-error",
"common-macro",
@@ -2440,7 +2451,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-stream",
@@ -2469,7 +2480,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"common-procedure",
@@ -2479,7 +2490,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -2505,7 +2516,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arc-swap",
"common-base",
@@ -2529,7 +2540,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"clap 4.5.40",
@@ -2558,7 +2569,7 @@ dependencies = [
[[package]]
name = "common-session"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"serde",
"strum 0.27.1",
@@ -2566,7 +2577,7 @@ dependencies = [
[[package]]
name = "common-sql"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-base",
"common-decimal",
@@ -2584,7 +2595,7 @@ dependencies = [
[[package]]
name = "common-stat"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-base",
"common-runtime",
@@ -2599,7 +2610,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"backtrace",
"common-base",
@@ -2628,7 +2639,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"client",
"common-grpc",
@@ -2641,7 +2652,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arrow",
"chrono",
@@ -2659,7 +2670,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"build-data",
"cargo-manifest",
@@ -2670,7 +2681,7 @@ dependencies = [
[[package]]
name = "common-wal"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-base",
"common-error",
@@ -2693,7 +2704,7 @@ dependencies = [
[[package]]
name = "common-workload"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"common-telemetry",
"serde",
@@ -3169,6 +3180,68 @@ dependencies = [
"cipher",
]
[[package]]
name = "cxx"
version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7620f6cfc4dcca21f2b085b7a890e16c60fd66f560cd69ee60594908dc72ab1"
dependencies = [
"cc",
"cxx-build",
"cxxbridge-cmd",
"cxxbridge-flags",
"cxxbridge-macro",
"foldhash 0.2.0",
"link-cplusplus",
]
[[package]]
name = "cxx-build"
version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a9bc1a22964ff6a355fbec24cf68266a0ed28f8b84c0864c386474ea3d0e479"
dependencies = [
"cc",
"codespan-reporting 0.13.1",
"indexmap 2.11.4",
"proc-macro2",
"quote",
"scratch",
"syn 2.0.106",
]
[[package]]
name = "cxxbridge-cmd"
version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f29a879d35f7906e3c9b77d7a1005a6a0787d330c09dfe4ffb5f617728cb44"
dependencies = [
"clap 4.5.40",
"codespan-reporting 0.13.1",
"indexmap 2.11.4",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "cxxbridge-flags"
version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d67109015f93f683e364085aa6489a5b2118b4a40058482101d699936a7836d6"
[[package]]
name = "cxxbridge-macro"
version = "1.0.190"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d187e019e7b05a1f3e69a8396b70800ee867aa9fc2ab972761173ccee03742df"
dependencies = [
"indexmap 2.11.4",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "darling"
version = "0.14.4"
@@ -3939,7 +4012,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow-flight",
@@ -4003,7 +4076,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arrow",
"arrow-array",
@@ -4677,7 +4750,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "file-engine"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -4809,7 +4882,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow",
@@ -4878,7 +4951,7 @@ dependencies = [
"sql",
"store-api",
"strum 0.27.1",
"substrait 1.0.0-beta.2",
"substrait 1.0.0-beta.3",
"table",
"tokio",
"tonic 0.13.1",
@@ -4916,6 +4989,12 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foldhash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -4933,7 +5012,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619"
[[package]]
name = "frontend"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arc-swap",
@@ -5516,7 +5595,7 @@ checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
"foldhash 0.1.5",
]
[[package]]
@@ -6148,7 +6227,7 @@ dependencies = [
[[package]]
name = "index"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -6161,6 +6240,7 @@ dependencies = [
"common-telemetry",
"common-test-util",
"criterion 0.4.0",
"datatypes",
"fastbloom",
"fst",
"futures",
@@ -6169,6 +6249,7 @@ dependencies = [
"jieba-rs",
"lazy_static",
"mockall",
"nalgebra",
"pin-project",
"prost 0.13.5",
"puffin",
@@ -6186,6 +6267,7 @@ dependencies = [
"tempfile",
"tokio",
"tokio-util",
"usearch",
"uuid",
]
@@ -7017,6 +7099,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "link-cplusplus"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82"
dependencies = [
"cc",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
@@ -7077,7 +7168,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "log-query"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"chrono",
"common-error",
@@ -7089,7 +7180,7 @@ dependencies = [
[[package]]
name = "log-store"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-stream",
"async-trait",
@@ -7390,7 +7481,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -7418,7 +7509,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -7518,7 +7609,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"aquamarine",
@@ -7615,7 +7706,7 @@ dependencies = [
[[package]]
name = "mito-codec"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"bytes",
@@ -7640,7 +7731,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"aquamarine",
@@ -8380,7 +8471,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"anyhow",
"bytes",
@@ -8665,7 +8756,7 @@ dependencies = [
[[package]]
name = "operator"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -8725,7 +8816,7 @@ dependencies = [
"sql",
"sqlparser",
"store-api",
"substrait 1.0.0-beta.2",
"substrait 1.0.0-beta.3",
"table",
"tokio",
"tokio-util",
@@ -9011,7 +9102,7 @@ dependencies = [
[[package]]
name = "partition"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -9368,7 +9459,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -9524,7 +9615,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"auth",
"catalog",
@@ -9826,7 +9917,7 @@ dependencies = [
[[package]]
name = "promql"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"async-trait",
@@ -10109,7 +10200,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-compression 0.4.19",
"async-trait",
@@ -10151,7 +10242,7 @@ dependencies = [
[[package]]
name = "query"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -10218,7 +10309,7 @@ dependencies = [
"sql",
"sqlparser",
"store-api",
"substrait 1.0.0-beta.2",
"substrait 1.0.0-beta.3",
"table",
"tokio",
"tokio-stream",
@@ -11290,6 +11381,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scratch"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2"
[[package]]
name = "scrypt"
version = "0.11.0"
@@ -11554,7 +11651,7 @@ dependencies = [
[[package]]
name = "servers"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -11682,7 +11779,7 @@ dependencies = [
[[package]]
name = "session"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"ahash 0.8.12",
"api",
@@ -12016,7 +12113,7 @@ dependencies = [
[[package]]
name = "sql"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow-buffer",
@@ -12076,7 +12173,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"clap 4.5.40",
@@ -12353,7 +12450,7 @@ dependencies = [
[[package]]
name = "standalone"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"catalog",
@@ -12394,7 +12491,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "store-api"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"aquamarine",
@@ -12607,7 +12704,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"async-trait",
"bytes",
@@ -12730,7 +12827,7 @@ dependencies = [
[[package]]
name = "table"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"async-trait",
@@ -12999,7 +13096,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
[[package]]
name = "tests-fuzz"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"arbitrary",
"async-trait",
@@ -13043,7 +13140,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
dependencies = [
"api",
"arrow-flight",
@@ -13118,7 +13215,7 @@ dependencies = [
"sqlx",
"standalone",
"store-api",
"substrait 1.0.0-beta.2",
"substrait 1.0.0-beta.3",
"table",
"tempfile",
"time",
@@ -14143,6 +14240,16 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "usearch"
version = "2.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc9fc5f872a3a4f9081d5f42624d788231b763e1846c829b9968a3755ac884d"
dependencies = [
"cxx",
"cxx-build",
]
[[package]]
name = "utf8-ranges"
version = "1.0.5"
@@ -14282,7 +14389,7 @@ dependencies = [
"ciborium",
"cidr",
"clap 4.5.40",
"codespan-reporting",
"codespan-reporting 0.12.0",
"community-id",
"convert_case 0.7.1",
"crc",

View File

@@ -75,7 +75,7 @@ members = [
resolver = "2"
[workspace.package]
version = "1.0.0-beta.2"
version = "1.0.0-beta.3"
edition = "2024"
license = "Apache-2.0"

View File

@@ -428,7 +428,7 @@ pub trait InformationExtension {
}
/// The request to inspect the datanode.
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq)]
pub struct DatanodeInspectRequest {
/// Kind to fetch from datanode.
pub kind: DatanodeInspectKind,

View File

@@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY,
VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions,
};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;

View File

@@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skip options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
/// Key used to store vector index options in arrow field's metadata.
pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index";
/// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -216,6 +218,53 @@ impl ColumnSchema {
self.metadata.contains_key(INVERTED_INDEX_KEY)
}
/// Checks if this column has a vector index.
pub fn is_vector_indexed(&self) -> bool {
match self.vector_index_options() {
Ok(opts) => opts.is_some(),
Err(e) => {
common_telemetry::warn!(
"Failed to deserialize vector_index_options for column '{}': {}",
self.name,
e
);
false
}
}
}
/// Gets the vector index options.
pub fn vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
match self.metadata.get(VECTOR_INDEX_KEY) {
None => Ok(None),
Some(json) => {
let options =
serde_json::from_str(json).context(error::DeserializeSnafu { json })?;
Ok(Some(options))
}
}
}
/// Sets the vector index options.
pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> {
self.metadata.insert(
VECTOR_INDEX_KEY.to_string(),
serde_json::to_string(options).context(error::SerializeSnafu)?,
);
Ok(())
}
/// Removes the vector index options.
pub fn unset_vector_index_options(&mut self) {
self.metadata.remove(VECTOR_INDEX_KEY);
}
/// Sets vector index options and returns self for chaining.
pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result<Self> {
self.set_vector_index_options(options)?;
Ok(self)
}
/// Set default constraint.
///
/// If a default constraint exists for the column, this method will
@@ -964,6 +1013,181 @@ impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
}
}
/// Distance metric for vector similarity search.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)]
#[serde(rename_all = "lowercase")]
pub enum VectorDistanceMetric {
/// Squared Euclidean distance (L2^2).
#[default]
L2sq,
/// Cosine distance (1 - cosine similarity).
Cosine,
/// Inner product (negative, for maximum inner product search).
#[serde(alias = "ip")]
InnerProduct,
}
impl fmt::Display for VectorDistanceMetric {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
VectorDistanceMetric::L2sq => write!(f, "l2sq"),
VectorDistanceMetric::Cosine => write!(f, "cosine"),
VectorDistanceMetric::InnerProduct => write!(f, "ip"),
}
}
}
impl std::str::FromStr for VectorDistanceMetric {
type Err = String;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"l2sq" | "l2" | "euclidean" => Ok(VectorDistanceMetric::L2sq),
"cosine" | "cos" => Ok(VectorDistanceMetric::Cosine),
"inner_product" | "ip" | "dot" => Ok(VectorDistanceMetric::InnerProduct),
_ => Err(format!(
"Unknown distance metric: {}. Expected: l2sq, cosine, or ip",
s
)),
}
}
}
impl VectorDistanceMetric {
/// Returns the metric as u8 for blob serialization.
pub fn as_u8(&self) -> u8 {
match self {
Self::L2sq => 0,
Self::Cosine => 1,
Self::InnerProduct => 2,
}
}
/// Parses metric from u8 (used when reading blob).
pub fn try_from_u8(v: u8) -> Option<Self> {
match v {
0 => Some(Self::L2sq),
1 => Some(Self::Cosine),
2 => Some(Self::InnerProduct),
_ => None,
}
}
}
/// Default HNSW connectivity parameter.
const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16;
/// Default expansion factor during index construction.
const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128;
/// Default expansion factor during search.
const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64;
fn default_vector_index_connectivity() -> u32 {
DEFAULT_VECTOR_INDEX_CONNECTIVITY
}
fn default_vector_index_expansion_add() -> u32 {
DEFAULT_VECTOR_INDEX_EXPANSION_ADD
}
fn default_vector_index_expansion_search() -> u32 {
DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH
}
/// Supported vector index engine types.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)]
#[serde(rename_all = "lowercase")]
pub enum VectorIndexEngineType {
/// USearch HNSW implementation.
#[default]
Usearch,
// Future: Vsag,
}
impl VectorIndexEngineType {
/// Returns the engine type as u8 for blob serialization.
pub fn as_u8(&self) -> u8 {
match self {
Self::Usearch => 0,
}
}
/// Parses engine type from u8 (used when reading blob).
pub fn try_from_u8(v: u8) -> Option<Self> {
match v {
0 => Some(Self::Usearch),
_ => None,
}
}
}
impl fmt::Display for VectorIndexEngineType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Usearch => write!(f, "usearch"),
}
}
}
impl std::str::FromStr for VectorIndexEngineType {
type Err = String;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"usearch" => Ok(Self::Usearch),
_ => Err(format!(
"Unknown vector index engine: {}. Expected: usearch",
s
)),
}
}
}
/// Options for vector index (HNSW).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
#[serde(rename_all = "kebab-case")]
pub struct VectorIndexOptions {
/// Vector index engine type (default: usearch).
#[serde(default)]
pub engine: VectorIndexEngineType,
/// Distance metric for similarity search.
#[serde(default)]
pub metric: VectorDistanceMetric,
/// HNSW connectivity parameter (M in the paper).
/// Higher values improve recall but increase memory usage.
#[serde(default = "default_vector_index_connectivity")]
pub connectivity: u32,
/// Expansion factor during index construction (ef_construction).
/// Higher values improve index quality but slow down construction.
#[serde(default = "default_vector_index_expansion_add")]
pub expansion_add: u32,
/// Expansion factor during search (ef_search).
/// Higher values improve recall but slow down search.
#[serde(default = "default_vector_index_expansion_search")]
pub expansion_search: u32,
}
impl Default for VectorIndexOptions {
fn default() -> Self {
Self {
engine: VectorIndexEngineType::default(),
metric: VectorDistanceMetric::default(),
connectivity: DEFAULT_VECTOR_INDEX_CONNECTIVITY,
expansion_add: DEFAULT_VECTOR_INDEX_EXPANSION_ADD,
expansion_search: DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH,
}
}
}
impl fmt::Display for VectorIndexOptions {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}",
self.engine, self.metric, self.connectivity, self.expansion_add, self.expansion_search
)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;

View File

@@ -7,6 +7,9 @@ license.workspace = true
[lints]
workspace = true
[features]
vector_index = ["dep:usearch"]
[dependencies]
async-trait.workspace = true
asynchronous-codec = "0.7.0"
@@ -17,6 +20,7 @@ common-error.workspace = true
common-macro.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
datatypes.workspace = true
fastbloom = "0.8"
fst.workspace = true
futures.workspace = true
@@ -25,6 +29,7 @@ itertools.workspace = true
jieba-rs = "0.8"
lazy_static.workspace = true
mockall.workspace = true
nalgebra.workspace = true
pin-project.workspace = true
prost.workspace = true
puffin.workspace = true
@@ -39,6 +44,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] }
tantivy-jieba = "0.16"
tokio.workspace = true
tokio-util.workspace = true
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
uuid.workspace = true
[dev-dependencies]

View File

@@ -22,6 +22,8 @@ pub mod external_provider;
pub mod fulltext_index;
pub mod inverted_index;
pub mod target;
#[cfg(feature = "vector_index")]
pub mod vector;
pub type Bytes = Vec<u8>;
pub type BytesRef<'a> = &'a [u8];

163
src/index/src/vector.rs Normal file
View File

@@ -0,0 +1,163 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Vector index types and options.
//!
//! This module re-exports types from `datatypes` and provides conversions
//! to USearch types, as well as distance computation functions.
pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions};
use nalgebra::DVectorView;
pub use usearch::MetricKind;
/// Converts a VectorDistanceMetric to a USearch MetricKind.
pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind {
match metric {
VectorDistanceMetric::L2sq => MetricKind::L2sq,
VectorDistanceMetric::Cosine => MetricKind::Cos,
VectorDistanceMetric::InnerProduct => MetricKind::IP,
}
}
/// Computes distance between two vectors using the specified metric.
///
/// Uses SIMD-optimized implementations via nalgebra.
///
/// **Note:** The caller must ensure that the two vectors have the same length
/// and are non-empty. Empty vectors return 0.0 for all metrics.
pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 {
// Empty vectors are degenerate; return 0.0 uniformly across all metrics.
if v1.is_empty() || v2.is_empty() {
return 0.0;
}
match metric {
VectorDistanceMetric::L2sq => l2sq(v1, v2),
VectorDistanceMetric::Cosine => cosine(v1, v2),
VectorDistanceMetric::InnerProduct => -dot(v1, v2),
}
}
/// Calculates the squared L2 distance between two vectors.
fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 {
let lhs = DVectorView::from_slice(lhs, lhs.len());
let rhs = DVectorView::from_slice(rhs, rhs.len());
(lhs - rhs).norm_squared()
}
/// Calculates the cosine distance between two vectors.
///
/// Returns a value in `[0.0, 2.0]` where 0.0 means identical direction and 2.0 means
/// opposite direction. For degenerate cases (zero or near-zero magnitude vectors),
/// returns 1.0 (maximum uncertainty) to avoid NaN and ensure safe index operations.
fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 {
let lhs_vec = DVectorView::from_slice(lhs, lhs.len());
let rhs_vec = DVectorView::from_slice(rhs, rhs.len());
let dot_product = lhs_vec.dot(&rhs_vec);
let lhs_norm = lhs_vec.norm();
let rhs_norm = rhs_vec.norm();
// Zero-magnitude vectors have undefined direction; return max distance as safe fallback.
if dot_product.abs() < f32::EPSILON
|| lhs_norm.abs() < f32::EPSILON
|| rhs_norm.abs() < f32::EPSILON
{
return 1.0;
}
let cos_similar = dot_product / (lhs_norm * rhs_norm);
let res = 1.0 - cos_similar;
// Clamp near-zero results to exactly 0.0 to avoid floating-point artifacts.
if res.abs() < f32::EPSILON { 0.0 } else { res }
}
/// Calculates the dot product between two vectors.
fn dot(lhs: &[f32], rhs: &[f32]) -> f32 {
let lhs = DVectorView::from_slice(lhs, lhs.len());
let rhs = DVectorView::from_slice(rhs, rhs.len());
lhs.dot(&rhs)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_distance_metric_to_usearch() {
assert_eq!(
distance_metric_to_usearch(VectorDistanceMetric::L2sq),
MetricKind::L2sq
);
assert_eq!(
distance_metric_to_usearch(VectorDistanceMetric::Cosine),
MetricKind::Cos
);
assert_eq!(
distance_metric_to_usearch(VectorDistanceMetric::InnerProduct),
MetricKind::IP
);
}
#[test]
fn test_vector_index_options_default() {
let options = VectorIndexOptions::default();
assert_eq!(options.metric, VectorDistanceMetric::L2sq);
assert_eq!(options.connectivity, 16);
assert_eq!(options.expansion_add, 128);
assert_eq!(options.expansion_search, 64);
}
#[test]
fn test_compute_distance_l2sq() {
let v1 = vec![1.0, 2.0, 3.0];
let v2 = vec![4.0, 5.0, 6.0];
// L2sq = (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27
let dist = compute_distance(&v1, &v2, VectorDistanceMetric::L2sq);
assert!((dist - 27.0).abs() < 1e-6);
}
#[test]
fn test_compute_distance_cosine() {
let v1 = vec![1.0, 0.0, 0.0];
let v2 = vec![0.0, 1.0, 0.0];
// Orthogonal vectors have cosine similarity of 0, distance of 1
let dist = compute_distance(&v1, &v2, VectorDistanceMetric::Cosine);
assert!((dist - 1.0).abs() < 1e-6);
}
#[test]
fn test_compute_distance_inner_product() {
let v1 = vec![1.0, 2.0, 3.0];
let v2 = vec![4.0, 5.0, 6.0];
// Inner product = 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
// Distance is negated: -32
let dist = compute_distance(&v1, &v2, VectorDistanceMetric::InnerProduct);
assert!((dist - (-32.0)).abs() < 1e-6);
}
#[test]
fn test_compute_distance_empty_vectors() {
// Empty vectors should return 0.0 uniformly for all metrics
assert_eq!(compute_distance(&[], &[], VectorDistanceMetric::L2sq), 0.0);
assert_eq!(
compute_distance(&[], &[], VectorDistanceMetric::Cosine),
0.0
);
assert_eq!(
compute_distance(&[], &[], VectorDistanceMetric::InnerProduct),
0.0
);
}
}

View File

@@ -774,7 +774,12 @@ fn memtable_flat_sources(
let iter = only_range.build_record_batch_iter(None)?;
// Dedup according to append mode and merge mode.
// Even single range may have duplicate rows.
let iter = maybe_dedup_one(options, field_column_start, iter);
let iter = maybe_dedup_one(
options.append_mode,
options.merge_mode(),
field_column_start,
iter,
);
flat_sources.sources.push(FlatSource::Iter(iter));
};
} else {
@@ -842,17 +847,18 @@ fn merge_and_dedup(
Ok(maybe_dedup)
}
fn maybe_dedup_one(
options: &RegionOptions,
pub fn maybe_dedup_one(
append_mode: bool,
merge_mode: MergeMode,
field_column_start: usize,
input_iter: BoxedRecordBatchIterator,
) -> BoxedRecordBatchIterator {
if options.append_mode {
if append_mode {
// No dedup in append mode
input_iter
} else {
// Dedup according to merge mode.
match options.merge_mode() {
match merge_mode {
MergeMode::LastRow => {
Box::new(FlatDedupIterator::new(input_iter, FlatLastRow::new(false)))
}

View File

@@ -55,10 +55,8 @@ pub mod time_partition;
pub mod time_series;
pub(crate) mod version;
#[cfg(any(test, feature = "test"))]
pub use bulk::part::BulkPart;
pub use bulk::part::{
BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size,
sort_primary_key_record_batch,
};
#[cfg(any(test, feature = "test"))]

View File

@@ -668,10 +668,10 @@ impl BulkMemtable {
}
/// Iterator builder for bulk range
struct BulkRangeIterBuilder {
part: BulkPart,
context: Arc<BulkIterContext>,
sequence: Option<SequenceRange>,
pub struct BulkRangeIterBuilder {
pub part: BulkPart,
pub context: Arc<BulkIterContext>,
pub sequence: Option<SequenceRange>,
}
impl IterBuilder for BulkRangeIterBuilder {
@@ -1188,7 +1188,6 @@ impl MemtableBuilder for BulkMemtableBuilder {
#[cfg(test)]
mod tests {
use mito_codec::row_converter::build_primary_key_codec;
use super::*;

View File

@@ -1211,343 +1211,24 @@ impl BulkPartEncoder {
}
}
/// Converts mutations to record batches.
fn mutations_to_record_batch(
mutations: &[Mutation],
metadata: &RegionMetadataRef,
pk_encoder: &DensePrimaryKeyCodec,
dedup: bool,
) -> Result<Option<(RecordBatch, i64, i64)>> {
let total_rows: usize = mutations
.iter()
.map(|m| m.rows.as_ref().map(|r| r.rows.len()).unwrap_or(0))
.sum();
if total_rows == 0 {
return Ok(None);
}
let mut pk_builder = BinaryBuilder::with_capacity(total_rows, 0);
let mut ts_vector: Box<dyn MutableVector> = metadata
.time_index_column()
.column_schema
.data_type
.create_mutable_vector(total_rows);
let mut sequence_builder = UInt64Builder::with_capacity(total_rows);
let mut op_type_builder = UInt8Builder::with_capacity(total_rows);
let mut field_builders: Vec<Box<dyn MutableVector>> = metadata
.field_columns()
.map(|f| f.column_schema.data_type.create_mutable_vector(total_rows))
.collect();
let mut pk_buffer = vec![];
for m in mutations {
let Some(key_values) = KeyValuesRef::new(metadata, m) else {
continue;
};
for row in key_values.iter() {
pk_buffer.clear();
pk_encoder
.encode_to_vec(row.primary_keys(), &mut pk_buffer)
.context(EncodeSnafu)?;
pk_builder.append_value(pk_buffer.as_bytes());
ts_vector.push_value_ref(&row.timestamp());
sequence_builder.append_value(row.sequence());
op_type_builder.append_value(row.op_type() as u8);
for (builder, field) in field_builders.iter_mut().zip(row.fields()) {
builder.push_value_ref(&field);
}
}
}
let arrow_schema = to_sst_arrow_schema(metadata);
// safety: timestamp column must be valid, and values must not be None.
let timestamp_unit = metadata
.time_index_column()
.column_schema
.data_type
.as_timestamp()
.unwrap()
.unit();
let sorter = ArraysSorter {
encoded_primary_keys: pk_builder.finish(),
timestamp_unit,
timestamp: ts_vector.to_vector().to_arrow_array(),
sequence: sequence_builder.finish(),
op_type: op_type_builder.finish(),
fields: field_builders
.iter_mut()
.map(|f| f.to_vector().to_arrow_array()),
dedup,
arrow_schema,
};
sorter.sort().map(Some)
}
struct ArraysSorter<I> {
encoded_primary_keys: BinaryArray,
timestamp_unit: TimeUnit,
timestamp: ArrayRef,
sequence: UInt64Array,
op_type: UInt8Array,
fields: I,
dedup: bool,
arrow_schema: SchemaRef,
}
impl<I> ArraysSorter<I>
where
I: Iterator<Item = ArrayRef>,
{
/// Converts arrays to record batch.
fn sort(self) -> Result<(RecordBatch, i64, i64)> {
debug_assert!(!self.timestamp.is_empty());
debug_assert!(self.timestamp.len() == self.sequence.len());
debug_assert!(self.timestamp.len() == self.op_type.len());
debug_assert!(self.timestamp.len() == self.encoded_primary_keys.len());
let timestamp_iter = timestamp_array_to_iter(self.timestamp_unit, &self.timestamp);
let (mut min_timestamp, mut max_timestamp) = (i64::MAX, i64::MIN);
let mut to_sort = self
.encoded_primary_keys
.iter()
.zip(timestamp_iter)
.zip(self.sequence.iter())
.map(|((pk, timestamp), sequence)| {
max_timestamp = max_timestamp.max(*timestamp);
min_timestamp = min_timestamp.min(*timestamp);
(pk, timestamp, sequence)
})
.enumerate()
.collect::<Vec<_>>();
to_sort.sort_unstable_by(|(_, (l_pk, l_ts, l_seq)), (_, (r_pk, r_ts, r_seq))| {
l_pk.cmp(r_pk)
.then(l_ts.cmp(r_ts))
.then(l_seq.cmp(r_seq).reverse())
});
if self.dedup {
// Dedup by timestamps while ignore sequence.
to_sort.dedup_by(|(_, (l_pk, l_ts, _)), (_, (r_pk, r_ts, _))| {
l_pk == r_pk && l_ts == r_ts
});
}
let indices = UInt32Array::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
let pk_dictionary = Arc::new(binary_array_to_dictionary(
// safety: pk must be BinaryArray
arrow::compute::take(
&self.encoded_primary_keys,
&indices,
Some(TakeOptions {
check_bounds: false,
}),
)
.context(ComputeArrowSnafu)?
.as_any()
.downcast_ref::<BinaryArray>()
.unwrap(),
)?) as ArrayRef;
let mut arrays = Vec::with_capacity(self.arrow_schema.fields.len());
for arr in self.fields {
arrays.push(
arrow::compute::take(
&arr,
&indices,
Some(TakeOptions {
check_bounds: false,
}),
)
.context(ComputeArrowSnafu)?,
);
}
let timestamp = arrow::compute::take(
&self.timestamp,
&indices,
Some(TakeOptions {
check_bounds: false,
}),
)
.context(ComputeArrowSnafu)?;
arrays.push(timestamp);
arrays.push(pk_dictionary);
arrays.push(
arrow::compute::take(
&self.sequence,
&indices,
Some(TakeOptions {
check_bounds: false,
}),
)
.context(ComputeArrowSnafu)?,
);
arrays.push(
arrow::compute::take(
&self.op_type,
&indices,
Some(TakeOptions {
check_bounds: false,
}),
)
.context(ComputeArrowSnafu)?,
);
let batch = RecordBatch::try_new(self.arrow_schema, arrays).context(NewRecordBatchSnafu)?;
Ok((batch, min_timestamp, max_timestamp))
}
}
/// Converts timestamp array to an iter of i64 values.
fn timestamp_array_to_iter(
timestamp_unit: TimeUnit,
timestamp: &ArrayRef,
) -> impl Iterator<Item = &i64> {
match timestamp_unit {
// safety: timestamp column must be valid.
TimeUnit::Second => timestamp
.as_any()
.downcast_ref::<TimestampSecondArray>()
.unwrap()
.values()
.iter(),
TimeUnit::Millisecond => timestamp
.as_any()
.downcast_ref::<TimestampMillisecondArray>()
.unwrap()
.values()
.iter(),
TimeUnit::Microsecond => timestamp
.as_any()
.downcast_ref::<TimestampMicrosecondArray>()
.unwrap()
.values()
.iter(),
TimeUnit::Nanosecond => timestamp
.as_any()
.downcast_ref::<TimestampNanosecondArray>()
.unwrap()
.values()
.iter(),
}
}
/// Converts a **sorted** [BinaryArray] to [DictionaryArray].
fn binary_array_to_dictionary(input: &BinaryArray) -> Result<PrimaryKeyArray> {
if input.is_empty() {
return Ok(DictionaryArray::new(
UInt32Array::from(Vec::<u32>::new()),
Arc::new(BinaryArray::from_vec(vec![])) as ArrayRef,
));
}
let mut keys = Vec::with_capacity(16);
let mut values = BinaryBuilder::new();
let mut prev: usize = 0;
keys.push(prev as u32);
values.append_value(input.value(prev));
for current_bytes in input.iter().skip(1) {
// safety: encoded pk must present.
let current_bytes = current_bytes.unwrap();
let prev_bytes = input.value(prev);
if current_bytes != prev_bytes {
values.append_value(current_bytes);
prev += 1;
}
keys.push(prev as u32);
}
Ok(DictionaryArray::new(
UInt32Array::from(keys),
Arc::new(values.finish()) as ArrayRef,
))
}
#[cfg(test)]
mod tests {
use std::collections::VecDeque;
use api::v1::{Row, SemanticType, WriteHint};
use datafusion_common::ScalarValue;
use datatypes::arrow::array::Float64Array;
use datatypes::prelude::{ConcreteDataType, ScalarVector, Value};
use datatypes::prelude::{ConcreteDataType, Value};
use datatypes::schema::ColumnSchema;
use datatypes::vectors::{Float64Vector, TimestampMillisecondVector};
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
use store_api::storage::RegionId;
use store_api::storage::consts::ReservedColumnId;
use super::*;
use crate::memtable::bulk::context::BulkIterContext;
use crate::sst::parquet::format::{PrimaryKeyReadFormat, ReadFormat};
use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
use crate::test_util::memtable_util::{
build_key_values_with_ts_seq_values, metadata_for_test, region_metadata_to_row_schema,
};
fn check_binary_array_to_dictionary(
input: &[&[u8]],
expected_keys: &[u32],
expected_values: &[&[u8]],
) {
let input = BinaryArray::from_iter_values(input.iter());
let array = binary_array_to_dictionary(&input).unwrap();
assert_eq!(
&expected_keys,
&array.keys().iter().map(|v| v.unwrap()).collect::<Vec<_>>()
);
assert_eq!(
expected_values,
&array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.unwrap()
.iter()
.map(|v| v.unwrap())
.collect::<Vec<_>>()
);
}
#[test]
fn test_binary_array_to_dictionary() {
check_binary_array_to_dictionary(&[], &[], &[]);
check_binary_array_to_dictionary(&["a".as_bytes()], &[0], &["a".as_bytes()]);
check_binary_array_to_dictionary(
&["a".as_bytes(), "a".as_bytes()],
&[0, 0],
&["a".as_bytes()],
);
check_binary_array_to_dictionary(
&["a".as_bytes(), "a".as_bytes(), "b".as_bytes()],
&[0, 0, 1],
&["a".as_bytes(), "b".as_bytes()],
);
check_binary_array_to_dictionary(
&[
"a".as_bytes(),
"a".as_bytes(),
"b".as_bytes(),
"c".as_bytes(),
],
&[0, 0, 1, 2],
&["a".as_bytes(), "b".as_bytes(), "c".as_bytes()],
);
}
struct MutationInput<'a> {
k0: &'a str,
k1: u32,
@@ -1563,232 +1244,6 @@ mod tests {
v1: &'a [Option<f64>],
}
fn check_mutations_to_record_batches(
input: &[MutationInput],
expected: &[BatchOutput],
expected_timestamp: (i64, i64),
dedup: bool,
) {
let metadata = metadata_for_test();
let mutations = input
.iter()
.map(|m| {
build_key_values_with_ts_seq_values(
&metadata,
m.k0.to_string(),
m.k1,
m.timestamps.iter().copied(),
m.v1.iter().copied(),
m.sequence,
)
.mutation
})
.collect::<Vec<_>>();
let total_rows: usize = mutations
.iter()
.flat_map(|m| m.rows.iter())
.map(|r| r.rows.len())
.sum();
let pk_encoder = DensePrimaryKeyCodec::new(&metadata);
let (batch, _, _) = mutations_to_record_batch(&mutations, &metadata, &pk_encoder, dedup)
.unwrap()
.unwrap();
let read_format = PrimaryKeyReadFormat::new_with_all_columns(metadata.clone());
let mut batches = VecDeque::new();
read_format
.convert_record_batch(&batch, None, &mut batches)
.unwrap();
if !dedup {
assert_eq!(
total_rows,
batches.iter().map(|b| { b.num_rows() }).sum::<usize>()
);
}
let batch_values = batches
.into_iter()
.map(|b| {
let pk_values = pk_encoder.decode(b.primary_key()).unwrap().into_dense();
let timestamps = b
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.map(|v| v.unwrap().0.value())
.collect::<Vec<_>>();
let float_values = b.fields()[1]
.data
.as_any()
.downcast_ref::<Float64Vector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>();
(pk_values, timestamps, float_values)
})
.collect::<Vec<_>>();
assert_eq!(expected.len(), batch_values.len());
for idx in 0..expected.len() {
assert_eq!(expected[idx].pk_values, &batch_values[idx].0);
assert_eq!(expected[idx].timestamps, &batch_values[idx].1);
assert_eq!(expected[idx].v1, &batch_values[idx].2);
}
}
#[test]
fn test_mutations_to_record_batch() {
check_mutations_to_record_batches(
&[MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.1)],
sequence: 0,
}],
&[BatchOutput {
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
timestamps: &[0],
v1: &[Some(0.1)],
}],
(0, 0),
true,
);
check_mutations_to_record_batches(
&[
MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.1)],
sequence: 0,
},
MutationInput {
k0: "b",
k1: 0,
timestamps: &[0],
v1: &[Some(0.0)],
sequence: 0,
},
MutationInput {
k0: "a",
k1: 0,
timestamps: &[1],
v1: &[Some(0.2)],
sequence: 1,
},
MutationInput {
k0: "a",
k1: 1,
timestamps: &[1],
v1: &[Some(0.3)],
sequence: 2,
},
],
&[
BatchOutput {
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
timestamps: &[0, 1],
v1: &[Some(0.1), Some(0.2)],
},
BatchOutput {
pk_values: &[Value::String("a".into()), Value::UInt32(1)],
timestamps: &[1],
v1: &[Some(0.3)],
},
BatchOutput {
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
timestamps: &[0],
v1: &[Some(0.0)],
},
],
(0, 1),
true,
);
check_mutations_to_record_batches(
&[
MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.1)],
sequence: 0,
},
MutationInput {
k0: "b",
k1: 0,
timestamps: &[0],
v1: &[Some(0.0)],
sequence: 0,
},
MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.2)],
sequence: 1,
},
],
&[
BatchOutput {
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
timestamps: &[0],
v1: &[Some(0.2)],
},
BatchOutput {
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
timestamps: &[0],
v1: &[Some(0.0)],
},
],
(0, 0),
true,
);
check_mutations_to_record_batches(
&[
MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.1)],
sequence: 0,
},
MutationInput {
k0: "b",
k1: 0,
timestamps: &[0],
v1: &[Some(0.0)],
sequence: 0,
},
MutationInput {
k0: "a",
k1: 0,
timestamps: &[0],
v1: &[Some(0.2)],
sequence: 1,
},
],
&[
BatchOutput {
pk_values: &[Value::String("a".into()), Value::UInt32(0)],
timestamps: &[0, 0],
v1: &[Some(0.2), Some(0.1)],
},
BatchOutput {
pk_values: &[Value::String("b".into()), Value::UInt32(0)],
timestamps: &[0],
v1: &[Some(0.0)],
},
],
(0, 0),
false,
);
}
fn encode(input: &[MutationInput]) -> EncodedBulkPart {
let metadata = metadata_for_test();
let kvs = input

View File

@@ -121,7 +121,7 @@ impl FlatSchemaOptions {
///
/// The schema is:
/// ```text
/// primary key columns, field columns, time index, __prmary_key, __sequence, __op_type
/// primary key columns, field columns, time index, __primary_key, __sequence, __op_type
/// ```
///
/// # Panics

View File

@@ -16,6 +16,7 @@ use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_base::secrets::{ExposeSecret, SecretString};
use common_telemetry::tracing::warn;
use opendal::services::{Azblob, Gcs, Oss, S3};
use serde::{Deserialize, Serialize};
@@ -123,11 +124,18 @@ impl From<&S3Connection> for S3 {
fn from(connection: &S3Connection) -> Self {
let root = util::normalize_dir(&connection.root);
let mut builder = S3::default()
.root(&root)
.bucket(&connection.bucket)
.access_key_id(connection.access_key_id.expose_secret())
.secret_access_key(connection.secret_access_key.expose_secret());
let mut builder = S3::default().root(&root).bucket(&connection.bucket);
if !connection.access_key_id.expose_secret().is_empty()
&& !connection.secret_access_key.expose_secret().is_empty()
{
builder = builder
.access_key_id(connection.access_key_id.expose_secret())
.secret_access_key(connection.secret_access_key.expose_secret());
} else {
warn!("No access key id or secret access key provided, using anonymous access");
builder = builder.allow_anonymous().disable_ec2_metadata();
}
if let Some(endpoint) = &connection.endpoint {
builder = builder.endpoint(endpoint);

View File

@@ -285,6 +285,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to set VECTOR index option"))]
SetVectorIndexOption {
source: datatypes::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Invalid partition number: {}, should be in range [2, 65536]",
partition_num
@@ -394,7 +401,9 @@ impl ErrorExt for Error {
ConvertValue { .. } => StatusCode::Unsupported,
PermissionDenied { .. } => StatusCode::PermissionDenied,
SetFulltextOption { .. } | SetSkippingIndexOption { .. } => StatusCode::Unexpected,
SetFulltextOption { .. }
| SetSkippingIndexOption { .. }
| SetVectorIndexOption { .. } => StatusCode::Unexpected,
}
}

View File

@@ -43,6 +43,7 @@ use crate::parser::{FLOW, ParserContext};
use crate::parsers::tql_parser;
use crate::parsers::utils::{
self, validate_column_fulltext_create_option, validate_column_skipping_index_create_option,
validate_column_vector_index_create_option,
};
use crate::statements::create::{
Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable,
@@ -60,6 +61,7 @@ pub const EXPIRE: &str = "EXPIRE";
pub const AFTER: &str = "AFTER";
pub const INVERTED: &str = "INVERTED";
pub const SKIPPING: &str = "SKIPPING";
pub const VECTOR: &str = "VECTOR";
pub type RawIntervalExpr = String;
@@ -928,6 +930,61 @@ impl<'a> ParserContext<'a> {
is_index_declared |= true;
}
// vector index
if let Token::Word(word) = parser.peek_token().token
&& word.value.eq_ignore_ascii_case(VECTOR)
{
parser.next_token();
// Consume `INDEX` keyword
ensure!(
parser.parse_keyword(Keyword::INDEX),
InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: "expect INDEX after VECTOR keyword",
}
);
ensure!(
column_extensions.vector_index_options.is_none(),
InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: "duplicated VECTOR INDEX option",
}
);
// Check that column is a vector type
let column_type = get_unalias_type(column_type);
let data_type = sql_data_type_to_concrete_data_type(&column_type, column_extensions)?;
ensure!(
matches!(data_type, ConcreteDataType::Vector(_)),
InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: "VECTOR INDEX only supports Vector type columns",
}
);
let options = parser
.parse_options(Keyword::WITH)
.context(error::SyntaxSnafu)?
.into_iter()
.map(parse_option_string)
.collect::<Result<Vec<_>>>()?;
for (key, _) in options.iter() {
ensure!(
validate_column_vector_index_create_option(key),
InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: format!("invalid VECTOR INDEX option: {key}"),
}
);
}
let options = OptionMap::new(options);
column_extensions.vector_index_options = Some(options);
is_index_declared |= true;
}
Ok(is_index_declared)
}
@@ -2714,7 +2771,8 @@ CREATE TABLE log (
#[test]
fn test_parse_column_extensions_vector() {
let sql = "VECTOR(128)";
// Test that vector options are parsed from data_type (no additional SQL needed)
let sql = "";
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
let tokens = tokenizer.tokenize().unwrap();
@@ -2734,7 +2792,8 @@ CREATE TABLE log (
#[test]
fn test_parse_column_extensions_vector_invalid() {
let sql = "VECTOR()";
// Test that vector with no dimension fails
let sql = "";
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
let tokens = tokenizer.tokenize().unwrap();
@@ -2912,4 +2971,174 @@ CREATE TABLE log (
.unwrap();
assert_eq!("SELECT '10 seconds'::INTERVAL", &stmts[0].to_string());
}
#[test]
fn test_parse_create_table_vector_index_options() {
// Test basic vector index
let sql = r"
CREATE TABLE vectors (
ts TIMESTAMP TIME INDEX,
vec VECTOR(128) VECTOR INDEX,
)";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
.unwrap();
if let Statement::CreateTable(c) = &result[0] {
c.columns.iter().for_each(|col| {
if col.name().value == "vec" {
assert!(
col.extensions
.vector_index_options
.as_ref()
.unwrap()
.is_empty()
);
}
});
} else {
panic!("should be create_table statement");
}
// Test vector index with options
let sql = r"
CREATE TABLE vectors (
ts TIMESTAMP TIME INDEX,
vec VECTOR(128) VECTOR INDEX WITH (metric='cosine', connectivity='32', expansion_add='256', expansion_search='128')
)";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
.unwrap();
if let Statement::CreateTable(c) = &result[0] {
c.columns.iter().for_each(|col| {
if col.name().value == "vec" {
let options = col.extensions.vector_index_options.as_ref().unwrap();
assert_eq!(options.len(), 4);
assert_eq!(options.get("metric").unwrap(), "cosine");
assert_eq!(options.get("connectivity").unwrap(), "32");
assert_eq!(options.get("expansion_add").unwrap(), "256");
assert_eq!(options.get("expansion_search").unwrap(), "128");
}
});
} else {
panic!("should be create_table statement");
}
}
#[test]
fn test_parse_create_table_vector_index_invalid_type() {
// Test vector index on non-vector type (should fail)
let sql = r"
CREATE TABLE vectors (
ts TIMESTAMP TIME INDEX,
col INT VECTOR INDEX,
)";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("VECTOR INDEX only supports Vector type columns")
);
}
#[test]
fn test_parse_create_table_vector_index_duplicate() {
// Test duplicate vector index (should fail)
let sql = r"
CREATE TABLE vectors (
ts TIMESTAMP TIME INDEX,
vec VECTOR(128) VECTOR INDEX VECTOR INDEX,
)";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("duplicated VECTOR INDEX option")
);
}
#[test]
fn test_parse_create_table_vector_index_invalid_option() {
// Test invalid option key (should fail)
let sql = r"
CREATE TABLE vectors (
ts TIMESTAMP TIME INDEX,
vec VECTOR(128) VECTOR INDEX WITH (metric='l2sq', invalid_option='foo')
)";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("invalid VECTOR INDEX option")
);
}
#[test]
fn test_parse_column_extensions_vector_index() {
// Test vector index on vector type
{
let sql = "VECTOR INDEX WITH (metric = 'l2sq')";
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
let tokens = tokenizer.tokenize().unwrap();
let mut parser = Parser::new(&dialect).with_tokens(tokens);
let name = Ident::new("vec_col");
let data_type =
DataType::Custom(vec![Ident::new("VECTOR")].into(), vec!["128".to_string()]);
// First, parse the vector type to set vector_options
let mut extensions = ColumnExtensions {
vector_options: Some(OptionMap::from([(
VECTOR_OPT_DIM.to_string(),
"128".to_string(),
)])),
..Default::default()
};
let result = ParserContext::parse_column_extensions(
&mut parser,
&name,
&data_type,
&mut extensions,
);
assert!(result.is_ok());
assert!(extensions.vector_index_options.is_some());
let vi_options = extensions.vector_index_options.unwrap();
assert_eq!(vi_options.get("metric"), Some("l2sq"));
}
// Test vector index on non-vector type (should fail)
{
let sql = "VECTOR INDEX";
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
let tokens = tokenizer.tokenize().unwrap();
let mut parser = Parser::new(&dialect).with_tokens(tokens);
let name = Ident::new("num_col");
let data_type = DataType::Int(None); // Non-vector type
let mut extensions = ColumnExtensions::default();
let result = ParserContext::parse_column_extensions(
&mut parser,
&name,
&data_type,
&mut extensions,
);
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("VECTOR INDEX only supports Vector type columns")
);
}
}
}

View File

@@ -222,6 +222,29 @@ pub fn validate_column_skipping_index_create_option(key: &str) -> bool {
.contains(&key)
}
/// Valid options for VECTOR INDEX:
/// - engine: Vector index engine (usearch)
/// - metric: Distance metric (l2sq, cosine, inner_product)
/// - connectivity: HNSW M parameter
/// - expansion_add: ef_construction parameter
/// - expansion_search: ef_search parameter
pub const COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE: &str = "engine";
pub const COLUMN_VECTOR_INDEX_OPT_KEY_METRIC: &str = "metric";
pub const COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY: &str = "connectivity";
pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD: &str = "expansion_add";
pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH: &str = "expansion_search";
pub fn validate_column_vector_index_create_option(key: &str) -> bool {
[
COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE,
COLUMN_VECTOR_INDEX_OPT_KEY_METRIC,
COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY,
COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD,
COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH,
]
.contains(&key)
}
/// Convert an [`IntervalMonthDayNano`] to a [`Duration`].
#[cfg(feature = "enterprise")]
pub fn convert_month_day_nano_to_duration(

View File

@@ -55,7 +55,7 @@ use crate::ast::{
use crate::error::{
self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result,
SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu,
SetSkippingIndexOptionSnafu, SqlCommonSnafu,
SetSkippingIndexOptionSnafu, SetVectorIndexOptionSnafu, SqlCommonSnafu,
};
use crate::statements::create::{Column, ColumnExtensions};
pub use crate::statements::option_map::OptionMap;
@@ -147,6 +147,12 @@ pub fn column_to_schema(
.context(SetSkippingIndexOptionSnafu)?;
}
if let Some(options) = column.extensions.build_vector_index_options()? {
column_schema = column_schema
.with_vector_index_options(&options)
.context(SetVectorIndexOptionSnafu)?;
}
column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some());
if matches!(column.data_type(), SqlDataType::JSON) {
@@ -710,6 +716,7 @@ mod tests {
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: None,
},
};
@@ -720,4 +727,82 @@ mod tests {
assert_eq!(fulltext_options.analyzer, FulltextAnalyzer::English);
assert!(fulltext_options.case_sensitive);
}
#[test]
fn test_column_to_schema_with_vector_index() {
use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
// Test with custom metric and parameters
let column = Column {
column_def: ColumnDef {
name: "embedding".into(),
data_type: SqlDataType::Custom(
vec![Ident::new(VECTOR_TYPE_NAME)].into(),
vec!["128".to_string()],
),
options: vec![],
},
extensions: ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::from([
("metric".to_string(), "cosine".to_string()),
("connectivity".to_string(), "32".to_string()),
("expansion_add".to_string(), "200".to_string()),
("expansion_search".to_string(), "100".to_string()),
])),
},
};
let column_schema = column_to_schema(&column, "ts", None).unwrap();
assert_eq!("embedding", column_schema.name);
assert!(column_schema.is_vector_indexed());
let vector_options = column_schema.vector_index_options().unwrap().unwrap();
assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch);
assert_eq!(vector_options.metric, VectorDistanceMetric::Cosine);
assert_eq!(vector_options.connectivity, 32);
assert_eq!(vector_options.expansion_add, 200);
assert_eq!(vector_options.expansion_search, 100);
}
#[test]
fn test_column_to_schema_with_vector_index_defaults() {
use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
// Test with default values (empty options map)
let column = Column {
column_def: ColumnDef {
name: "vec".into(),
data_type: SqlDataType::Custom(
vec![Ident::new(VECTOR_TYPE_NAME)].into(),
vec!["64".to_string()],
),
options: vec![],
},
extensions: ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::default()),
},
};
let column_schema = column_to_schema(&column, "ts", None).unwrap();
assert_eq!("vec", column_schema.name);
assert!(column_schema.is_vector_indexed());
let vector_options = column_schema.vector_index_options().unwrap().unwrap();
// Verify defaults
assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch);
assert_eq!(vector_options.metric, VectorDistanceMetric::L2sq);
assert_eq!(vector_options.connectivity, 16);
assert_eq!(vector_options.expansion_add, 128);
assert_eq!(vector_options.expansion_search, 64);
}
}

View File

@@ -17,7 +17,10 @@ use std::fmt::{Display, Formatter};
use common_catalog::consts::FILE_ENGINE;
use datatypes::json::JsonStructureSettings;
use datatypes::schema::{FulltextOptions, SkippingIndexOptions};
use datatypes::schema::{
FulltextOptions, SkippingIndexOptions, VectorDistanceMetric, VectorIndexEngineType,
VectorIndexOptions,
};
use itertools::Itertools;
use serde::Serialize;
use snafu::ResultExt;
@@ -133,6 +136,8 @@ pub struct ColumnExtensions {
///
/// Inverted index doesn't have options at present. There won't be any options in that map.
pub inverted_index_options: Option<OptionMap>,
/// Vector index options for HNSW-based vector similarity search.
pub vector_index_options: Option<OptionMap>,
pub json_datatype_options: Option<OptionMap>,
}
@@ -208,6 +213,15 @@ impl Display for Column {
write!(f, " INVERTED INDEX")?;
}
}
if let Some(vector_index_options) = &self.extensions.vector_index_options {
if !vector_index_options.is_empty() {
let options = vector_index_options.kv_pairs();
write!(f, " VECTOR INDEX WITH({})", format_list_comma!(options))?;
} else {
write!(f, " VECTOR INDEX")?;
}
}
Ok(())
}
}
@@ -233,6 +247,89 @@ impl ColumnExtensions {
))
}
pub fn build_vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
let Some(options) = self.vector_index_options.as_ref() else {
return Ok(None);
};
let options_map: HashMap<String, String> = options.clone().into_map();
let mut result = VectorIndexOptions::default();
if let Some(s) = options_map.get("engine") {
result.engine = s.parse::<VectorIndexEngineType>().map_err(|e| {
InvalidSqlSnafu {
msg: format!("invalid VECTOR INDEX engine: {e}"),
}
.build()
})?;
}
if let Some(s) = options_map.get("metric") {
result.metric = s.parse::<VectorDistanceMetric>().map_err(|e| {
InvalidSqlSnafu {
msg: format!("invalid VECTOR INDEX metric: {e}"),
}
.build()
})?;
}
if let Some(s) = options_map.get("connectivity") {
let value = s.parse::<u32>().map_err(|_| {
InvalidSqlSnafu {
msg: format!(
"invalid VECTOR INDEX connectivity: {s}, expected positive integer"
),
}
.build()
})?;
if !(2..=2048).contains(&value) {
return InvalidSqlSnafu {
msg: "VECTOR INDEX connectivity must be in the range [2, 2048].".to_string(),
}
.fail();
}
result.connectivity = value;
}
if let Some(s) = options_map.get("expansion_add") {
let value = s.parse::<u32>().map_err(|_| {
InvalidSqlSnafu {
msg: format!(
"invalid VECTOR INDEX expansion_add: {s}, expected positive integer"
),
}
.build()
})?;
if value == 0 {
return InvalidSqlSnafu {
msg: "VECTOR INDEX expansion_add must be greater than 0".to_string(),
}
.fail();
}
result.expansion_add = value;
}
if let Some(s) = options_map.get("expansion_search") {
let value = s.parse::<u32>().map_err(|_| {
InvalidSqlSnafu {
msg: format!(
"invalid VECTOR INDEX expansion_search: {s}, expected positive integer"
),
}
.build()
})?;
if value == 0 {
return InvalidSqlSnafu {
msg: "VECTOR INDEX expansion_search must be greater than 0".to_string(),
}
.fail();
}
result.expansion_search = value;
}
Ok(Some(result))
}
pub fn build_json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
let Some(options) = self.json_datatype_options.as_ref() else {
return Ok(None);
@@ -893,4 +990,92 @@ AS SELECT number FROM numbers_input where number > 10"#,
_ => unreachable!(),
}
}
#[test]
fn test_vector_index_options_validation() {
use super::{ColumnExtensions, OptionMap};
// Test zero connectivity should fail
let extensions = ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::from([(
"connectivity".to_string(),
"0".to_string(),
)])),
};
let result = extensions.build_vector_index_options();
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("connectivity must be in the range [2, 2048]")
);
// Test zero expansion_add should fail
let extensions = ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::from([(
"expansion_add".to_string(),
"0".to_string(),
)])),
};
let result = extensions.build_vector_index_options();
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("expansion_add must be greater than 0")
);
// Test zero expansion_search should fail
let extensions = ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::from([(
"expansion_search".to_string(),
"0".to_string(),
)])),
};
let result = extensions.build_vector_index_options();
assert!(result.is_err());
assert!(
result
.unwrap_err()
.to_string()
.contains("expansion_search must be greater than 0")
);
// Test valid values should succeed
let extensions = ColumnExtensions {
fulltext_index_options: None,
vector_options: None,
skipping_index_options: None,
inverted_index_options: None,
json_datatype_options: None,
vector_index_options: Some(OptionMap::from([
("connectivity".to_string(), "32".to_string()),
("expansion_add".to_string(), "200".to_string()),
("expansion_search".to_string(), "100".to_string()),
])),
};
let result = extensions.build_vector_index_options();
assert!(result.is_ok());
let options = result.unwrap().unwrap();
assert_eq!(options.connectivity, 32);
assert_eq!(options.expansion_add, 200);
assert_eq!(options.expansion_search, 100);
}
}

View File

@@ -27,5 +27,8 @@ pub use datatypes::schema::{
pub use self::descriptors::*;
pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, ParseIdError};
pub use self::requests::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
pub use self::requests::{
ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector, VectorDistanceMetric,
VectorIndexEngine, VectorIndexEngineType, VectorSearchMatches, VectorSearchRequest,
};
pub use self::types::{SequenceNumber, SequenceRange};

View File

@@ -14,11 +14,66 @@
use std::fmt::{Display, Formatter};
use common_error::ext::BoxedError;
use common_recordbatch::OrderOption;
use datafusion_expr::expr::Expr;
// Re-export vector types from datatypes to avoid duplication
pub use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
use strum::Display;
use crate::storage::SequenceNumber;
use crate::storage::{ColumnId, SequenceNumber};
/// A hint for KNN vector search.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorSearchRequest {
/// Column ID of the vector column to search.
pub column_id: ColumnId,
/// The query vector to search for.
pub query_vector: Vec<f32>,
/// Number of nearest neighbors to return.
pub k: usize,
/// Distance metric to use (matches the index metric).
pub metric: VectorDistanceMetric,
}
/// Search results from vector index.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorSearchMatches {
/// Keys (row offsets in the index).
pub keys: Vec<u64>,
/// Distances from the query vector.
pub distances: Vec<f32>,
}
/// Trait for vector index engines (HNSW implementations).
///
/// This trait defines the interface for pluggable vector index engines.
/// Implementations (e.g., UsearchEngine) are provided by storage engines like mito2.
pub trait VectorIndexEngine: Send + Sync {
/// Adds a vector with the given key.
fn add(&mut self, key: u64, vector: &[f32]) -> Result<(), BoxedError>;
/// Searches for k nearest neighbors.
fn search(&self, query: &[f32], k: usize) -> Result<VectorSearchMatches, BoxedError>;
/// Returns the serialized length.
fn serialized_length(&self) -> usize;
/// Serializes the index to a buffer.
fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<(), BoxedError>;
/// Reserves capacity for vectors.
fn reserve(&mut self, capacity: usize) -> Result<(), BoxedError>;
/// Returns current size (number of vectors).
fn size(&self) -> usize;
/// Returns current capacity.
fn capacity(&self) -> usize;
/// Returns memory usage in bytes.
fn memory_usage(&self) -> usize;
}
/// A hint on how to select rows from a time-series.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)]
@@ -38,7 +93,7 @@ pub enum TimeSeriesDistribution {
PerSeries,
}
#[derive(Default, Clone, Debug, PartialEq, Eq)]
#[derive(Default, Clone, Debug, PartialEq)]
pub struct ScanRequest {
/// Indices of columns to read, `None` to read all columns. This indices is
/// based on table schema.
@@ -66,6 +121,9 @@ pub struct ScanRequest {
pub sst_min_sequence: Option<SequenceNumber>,
/// Optional hint for the distribution of time-series data.
pub distribution: Option<TimeSeriesDistribution>,
/// Optional hint for KNN vector search. When set, the scan should use
/// vector index to find the k nearest neighbors.
pub vector_search: Option<VectorSearchRequest>,
}
impl Display for ScanRequest {
@@ -138,6 +196,16 @@ impl Display for ScanRequest {
if let Some(distribution) = &self.distribution {
write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?;
}
if let Some(vector_search) = &self.vector_search {
write!(
f,
"{}vector_search: column_id={}, k={}, metric={}",
delimiter.as_str(),
vector_search.column_id,
vector_search.k,
vector_search.metric
)?;
}
write!(f, " }}")
}
}