From a35a39f726a5e01308442968743c84648a10e17e Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Wed, 17 Dec 2025 06:45:36 +0800 Subject: [PATCH] feat(vector_index): adds the foundational types and SQL parsing support for vector index (#7366) * feat: adds the foundational types and SQL parsing support for vector index Signed-off-by: Dennis Zhuang * refactor: by suggestions Signed-off-by: Dennis Zhuang * fix: ensure index option values must be greater than zero Signed-off-by: Dennis Zhuang * chore: validate connectivity strictly Signed-off-by: Dennis Zhuang * fix: compile error Signed-off-by: Dennis Zhuang * feat: disable SIMD for ci Signed-off-by: Dennis Zhuang --------- Signed-off-by: Dennis Zhuang --- Cargo.lock | 111 ++++++++- .../src/system_schema/information_schema.rs | 2 +- src/datatypes/src/schema.rs | 3 +- src/datatypes/src/schema/column_schema.rs | 224 +++++++++++++++++ src/index/Cargo.toml | 3 + src/index/src/lib.rs | 1 + src/index/src/vector.rs | 163 ++++++++++++ src/sql/src/error.rs | 11 +- src/sql/src/parsers/create_parser.rs | 233 +++++++++++++++++- src/sql/src/parsers/utils.rs | 23 ++ src/sql/src/statements.rs | 87 ++++++- src/sql/src/statements/create.rs | 187 +++++++++++++- src/store-api/src/storage.rs | 5 +- src/store-api/src/storage/requests.rs | 72 +++++- 14 files changed, 1113 insertions(+), 12 deletions(-) create mode 100644 src/index/src/vector.rs diff --git a/Cargo.lock b/Cargo.lock index 673a2ee2bc..281cbabb5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1977,6 +1977,17 @@ dependencies = [ "unicode-width 0.2.1", ] +[[package]] +name = "codespan-reporting" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" +dependencies = [ + "serde", + "termcolor", + "unicode-width 0.2.1", +] + [[package]] name = "colorchoice" version = "1.0.4" @@ -3169,6 +3180,68 @@ dependencies = [ "cipher", ] +[[package]] +name = "cxx" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7620f6cfc4dcca21f2b085b7a890e16c60fd66f560cd69ee60594908dc72ab1" +dependencies = [ + "cc", + "cxx-build", + "cxxbridge-cmd", + "cxxbridge-flags", + "cxxbridge-macro", + "foldhash 0.2.0", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9bc1a22964ff6a355fbec24cf68266a0ed28f8b84c0864c386474ea3d0e479" +dependencies = [ + "cc", + "codespan-reporting 0.13.1", + "indexmap 2.11.4", + "proc-macro2", + "quote", + "scratch", + "syn 2.0.106", +] + +[[package]] +name = "cxxbridge-cmd" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f29a879d35f7906e3c9b77d7a1005a6a0787d330c09dfe4ffb5f617728cb44" +dependencies = [ + "clap 4.5.40", + "codespan-reporting 0.13.1", + "indexmap 2.11.4", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d67109015f93f683e364085aa6489a5b2118b4a40058482101d699936a7836d6" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d187e019e7b05a1f3e69a8396b70800ee867aa9fc2ab972761173ccee03742df" +dependencies = [ + "indexmap 2.11.4", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "darling" version = "0.14.4" @@ -4916,6 +4989,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -5516,7 +5595,7 @@ checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -6161,6 +6240,7 @@ dependencies = [ "common-telemetry", "common-test-util", "criterion 0.4.0", + "datatypes", "fastbloom", "fst", "futures", @@ -6169,6 +6249,7 @@ dependencies = [ "jieba-rs", "lazy_static", "mockall", + "nalgebra", "pin-project", "prost 0.13.5", "puffin", @@ -6186,6 +6267,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util", + "usearch", "uuid", ] @@ -7017,6 +7099,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "link-cplusplus" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82" +dependencies = [ + "cc", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -11290,6 +11381,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scratch" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" + [[package]] name = "scrypt" version = "0.11.0" @@ -14143,6 +14240,16 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "usearch" +version = "2.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc9fc5f872a3a4f9081d5f42624d788231b763e1846c829b9968a3755ac884d" +dependencies = [ + "cxx", + "cxx-build", +] + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -14282,7 +14389,7 @@ dependencies = [ "ciborium", "cidr", "clap 4.5.40", - "codespan-reporting", + "codespan-reporting 0.12.0", "community-id", "convert_case 0.7.1", "crc", diff --git a/src/catalog/src/system_schema/information_schema.rs b/src/catalog/src/system_schema/information_schema.rs index 18384b8163..9715aa9402 100644 --- a/src/catalog/src/system_schema/information_schema.rs +++ b/src/catalog/src/system_schema/information_schema.rs @@ -428,7 +428,7 @@ pub trait InformationExtension { } /// The request to inspect the datanode. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct DatanodeInspectRequest { /// Kind to fetch from datanode. pub kind: DatanodeInspectKind, diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 812b3c3b22..b5451617f8 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{ COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, - SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, + SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY, + VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions, }; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index 9272ba4b21..38cdd7bb06 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext"; pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index"; /// Key used to store skip options in arrow field's metadata. pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index"; +/// Key used to store vector index options in arrow field's metadata. +pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index"; /// Keys used in fulltext options pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable"; @@ -216,6 +218,53 @@ impl ColumnSchema { self.metadata.contains_key(INVERTED_INDEX_KEY) } + /// Checks if this column has a vector index. + pub fn is_vector_indexed(&self) -> bool { + match self.vector_index_options() { + Ok(opts) => opts.is_some(), + Err(e) => { + common_telemetry::warn!( + "Failed to deserialize vector_index_options for column '{}': {}", + self.name, + e + ); + false + } + } + } + + /// Gets the vector index options. + pub fn vector_index_options(&self) -> Result> { + match self.metadata.get(VECTOR_INDEX_KEY) { + None => Ok(None), + Some(json) => { + let options = + serde_json::from_str(json).context(error::DeserializeSnafu { json })?; + Ok(Some(options)) + } + } + } + + /// Sets the vector index options. + pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> { + self.metadata.insert( + VECTOR_INDEX_KEY.to_string(), + serde_json::to_string(options).context(error::SerializeSnafu)?, + ); + Ok(()) + } + + /// Removes the vector index options. + pub fn unset_vector_index_options(&mut self) { + self.metadata.remove(VECTOR_INDEX_KEY); + } + + /// Sets vector index options and returns self for chaining. + pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result { + self.set_vector_index_options(options)?; + Ok(self) + } + /// Set default constraint. /// /// If a default constraint exists for the column, this method will @@ -964,6 +1013,181 @@ impl TryFrom> for SkippingIndexOptions { } } +/// Distance metric for vector similarity search. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)] +#[serde(rename_all = "lowercase")] +pub enum VectorDistanceMetric { + /// Squared Euclidean distance (L2^2). + #[default] + L2sq, + /// Cosine distance (1 - cosine similarity). + Cosine, + /// Inner product (negative, for maximum inner product search). + #[serde(alias = "ip")] + InnerProduct, +} + +impl fmt::Display for VectorDistanceMetric { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + VectorDistanceMetric::L2sq => write!(f, "l2sq"), + VectorDistanceMetric::Cosine => write!(f, "cosine"), + VectorDistanceMetric::InnerProduct => write!(f, "ip"), + } + } +} + +impl std::str::FromStr for VectorDistanceMetric { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "l2sq" | "l2" | "euclidean" => Ok(VectorDistanceMetric::L2sq), + "cosine" | "cos" => Ok(VectorDistanceMetric::Cosine), + "inner_product" | "ip" | "dot" => Ok(VectorDistanceMetric::InnerProduct), + _ => Err(format!( + "Unknown distance metric: {}. Expected: l2sq, cosine, or ip", + s + )), + } + } +} + +impl VectorDistanceMetric { + /// Returns the metric as u8 for blob serialization. + pub fn as_u8(&self) -> u8 { + match self { + Self::L2sq => 0, + Self::Cosine => 1, + Self::InnerProduct => 2, + } + } + + /// Parses metric from u8 (used when reading blob). + pub fn try_from_u8(v: u8) -> Option { + match v { + 0 => Some(Self::L2sq), + 1 => Some(Self::Cosine), + 2 => Some(Self::InnerProduct), + _ => None, + } + } +} + +/// Default HNSW connectivity parameter. +const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16; +/// Default expansion factor during index construction. +const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128; +/// Default expansion factor during search. +const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64; + +fn default_vector_index_connectivity() -> u32 { + DEFAULT_VECTOR_INDEX_CONNECTIVITY +} + +fn default_vector_index_expansion_add() -> u32 { + DEFAULT_VECTOR_INDEX_EXPANSION_ADD +} + +fn default_vector_index_expansion_search() -> u32 { + DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH +} + +/// Supported vector index engine types. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)] +#[serde(rename_all = "lowercase")] +pub enum VectorIndexEngineType { + /// USearch HNSW implementation. + #[default] + Usearch, + // Future: Vsag, +} + +impl VectorIndexEngineType { + /// Returns the engine type as u8 for blob serialization. + pub fn as_u8(&self) -> u8 { + match self { + Self::Usearch => 0, + } + } + + /// Parses engine type from u8 (used when reading blob). + pub fn try_from_u8(v: u8) -> Option { + match v { + 0 => Some(Self::Usearch), + _ => None, + } + } +} + +impl fmt::Display for VectorIndexEngineType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Usearch => write!(f, "usearch"), + } + } +} + +impl std::str::FromStr for VectorIndexEngineType { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "usearch" => Ok(Self::Usearch), + _ => Err(format!( + "Unknown vector index engine: {}. Expected: usearch", + s + )), + } + } +} + +/// Options for vector index (HNSW). +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)] +#[serde(rename_all = "kebab-case")] +pub struct VectorIndexOptions { + /// Vector index engine type (default: usearch). + #[serde(default)] + pub engine: VectorIndexEngineType, + /// Distance metric for similarity search. + #[serde(default)] + pub metric: VectorDistanceMetric, + /// HNSW connectivity parameter (M in the paper). + /// Higher values improve recall but increase memory usage. + #[serde(default = "default_vector_index_connectivity")] + pub connectivity: u32, + /// Expansion factor during index construction (ef_construction). + /// Higher values improve index quality but slow down construction. + #[serde(default = "default_vector_index_expansion_add")] + pub expansion_add: u32, + /// Expansion factor during search (ef_search). + /// Higher values improve recall but slow down search. + #[serde(default = "default_vector_index_expansion_search")] + pub expansion_search: u32, +} + +impl Default for VectorIndexOptions { + fn default() -> Self { + Self { + engine: VectorIndexEngineType::default(), + metric: VectorDistanceMetric::default(), + connectivity: DEFAULT_VECTOR_INDEX_CONNECTIVITY, + expansion_add: DEFAULT_VECTOR_INDEX_EXPANSION_ADD, + expansion_search: DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH, + } + } +} + +impl fmt::Display for VectorIndexOptions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}", + self.engine, self.metric, self.connectivity, self.expansion_add, self.expansion_search + ) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml index bde6959b89..03fa4da4e8 100644 --- a/src/index/Cargo.toml +++ b/src/index/Cargo.toml @@ -17,6 +17,7 @@ common-error.workspace = true common-macro.workspace = true common-runtime.workspace = true common-telemetry.workspace = true +datatypes.workspace = true fastbloom = "0.8" fst.workspace = true futures.workspace = true @@ -25,6 +26,7 @@ itertools.workspace = true jieba-rs = "0.8" lazy_static.workspace = true mockall.workspace = true +nalgebra.workspace = true pin-project.workspace = true prost.workspace = true puffin.workspace = true @@ -39,6 +41,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] } tantivy-jieba = "0.16" tokio.workspace = true tokio-util.workspace = true +usearch = { version = "2.21", default-features = false, features = ["fp16lib"] } uuid.workspace = true [dev-dependencies] diff --git a/src/index/src/lib.rs b/src/index/src/lib.rs index 547f880bb4..f4f299bef6 100644 --- a/src/index/src/lib.rs +++ b/src/index/src/lib.rs @@ -22,6 +22,7 @@ pub mod external_provider; pub mod fulltext_index; pub mod inverted_index; pub mod target; +pub mod vector; pub type Bytes = Vec; pub type BytesRef<'a> = &'a [u8]; diff --git a/src/index/src/vector.rs b/src/index/src/vector.rs new file mode 100644 index 0000000000..77c844f610 --- /dev/null +++ b/src/index/src/vector.rs @@ -0,0 +1,163 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Vector index types and options. +//! +//! This module re-exports types from `datatypes` and provides conversions +//! to USearch types, as well as distance computation functions. + +pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions}; +use nalgebra::DVectorView; +pub use usearch::MetricKind; + +/// Converts a VectorDistanceMetric to a USearch MetricKind. +pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind { + match metric { + VectorDistanceMetric::L2sq => MetricKind::L2sq, + VectorDistanceMetric::Cosine => MetricKind::Cos, + VectorDistanceMetric::InnerProduct => MetricKind::IP, + } +} + +/// Computes distance between two vectors using the specified metric. +/// +/// Uses SIMD-optimized implementations via nalgebra. +/// +/// **Note:** The caller must ensure that the two vectors have the same length +/// and are non-empty. Empty vectors return 0.0 for all metrics. +pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 { + // Empty vectors are degenerate; return 0.0 uniformly across all metrics. + if v1.is_empty() || v2.is_empty() { + return 0.0; + } + + match metric { + VectorDistanceMetric::L2sq => l2sq(v1, v2), + VectorDistanceMetric::Cosine => cosine(v1, v2), + VectorDistanceMetric::InnerProduct => -dot(v1, v2), + } +} + +/// Calculates the squared L2 distance between two vectors. +fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs = DVectorView::from_slice(lhs, lhs.len()); + let rhs = DVectorView::from_slice(rhs, rhs.len()); + (lhs - rhs).norm_squared() +} + +/// Calculates the cosine distance between two vectors. +/// +/// Returns a value in `[0.0, 2.0]` where 0.0 means identical direction and 2.0 means +/// opposite direction. For degenerate cases (zero or near-zero magnitude vectors), +/// returns 1.0 (maximum uncertainty) to avoid NaN and ensure safe index operations. +fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs_vec = DVectorView::from_slice(lhs, lhs.len()); + let rhs_vec = DVectorView::from_slice(rhs, rhs.len()); + + let dot_product = lhs_vec.dot(&rhs_vec); + let lhs_norm = lhs_vec.norm(); + let rhs_norm = rhs_vec.norm(); + + // Zero-magnitude vectors have undefined direction; return max distance as safe fallback. + if dot_product.abs() < f32::EPSILON + || lhs_norm.abs() < f32::EPSILON + || rhs_norm.abs() < f32::EPSILON + { + return 1.0; + } + + let cos_similar = dot_product / (lhs_norm * rhs_norm); + let res = 1.0 - cos_similar; + // Clamp near-zero results to exactly 0.0 to avoid floating-point artifacts. + if res.abs() < f32::EPSILON { 0.0 } else { res } +} + +/// Calculates the dot product between two vectors. +fn dot(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs = DVectorView::from_slice(lhs, lhs.len()); + let rhs = DVectorView::from_slice(rhs, rhs.len()); + lhs.dot(&rhs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_distance_metric_to_usearch() { + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::L2sq), + MetricKind::L2sq + ); + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::Cosine), + MetricKind::Cos + ); + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::InnerProduct), + MetricKind::IP + ); + } + + #[test] + fn test_vector_index_options_default() { + let options = VectorIndexOptions::default(); + assert_eq!(options.metric, VectorDistanceMetric::L2sq); + assert_eq!(options.connectivity, 16); + assert_eq!(options.expansion_add, 128); + assert_eq!(options.expansion_search, 64); + } + + #[test] + fn test_compute_distance_l2sq() { + let v1 = vec![1.0, 2.0, 3.0]; + let v2 = vec![4.0, 5.0, 6.0]; + // L2sq = (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::L2sq); + assert!((dist - 27.0).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_cosine() { + let v1 = vec![1.0, 0.0, 0.0]; + let v2 = vec![0.0, 1.0, 0.0]; + // Orthogonal vectors have cosine similarity of 0, distance of 1 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::Cosine); + assert!((dist - 1.0).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_inner_product() { + let v1 = vec![1.0, 2.0, 3.0]; + let v2 = vec![4.0, 5.0, 6.0]; + // Inner product = 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32 + // Distance is negated: -32 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::InnerProduct); + assert!((dist - (-32.0)).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_empty_vectors() { + // Empty vectors should return 0.0 uniformly for all metrics + assert_eq!(compute_distance(&[], &[], VectorDistanceMetric::L2sq), 0.0); + assert_eq!( + compute_distance(&[], &[], VectorDistanceMetric::Cosine), + 0.0 + ); + assert_eq!( + compute_distance(&[], &[], VectorDistanceMetric::InnerProduct), + 0.0 + ); + } +} diff --git a/src/sql/src/error.rs b/src/sql/src/error.rs index 46fbd29d1a..cb7a71f0e4 100644 --- a/src/sql/src/error.rs +++ b/src/sql/src/error.rs @@ -285,6 +285,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to set VECTOR index option"))] + SetVectorIndexOption { + source: datatypes::error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Invalid partition number: {}, should be in range [2, 65536]", partition_num @@ -394,7 +401,9 @@ impl ErrorExt for Error { ConvertValue { .. } => StatusCode::Unsupported, PermissionDenied { .. } => StatusCode::PermissionDenied, - SetFulltextOption { .. } | SetSkippingIndexOption { .. } => StatusCode::Unexpected, + SetFulltextOption { .. } + | SetSkippingIndexOption { .. } + | SetVectorIndexOption { .. } => StatusCode::Unexpected, } } diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs index 53dcdb0e03..fe68a07669 100644 --- a/src/sql/src/parsers/create_parser.rs +++ b/src/sql/src/parsers/create_parser.rs @@ -43,6 +43,7 @@ use crate::parser::{FLOW, ParserContext}; use crate::parsers::tql_parser; use crate::parsers::utils::{ self, validate_column_fulltext_create_option, validate_column_skipping_index_create_option, + validate_column_vector_index_create_option, }; use crate::statements::create::{ Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable, @@ -60,6 +61,7 @@ pub const EXPIRE: &str = "EXPIRE"; pub const AFTER: &str = "AFTER"; pub const INVERTED: &str = "INVERTED"; pub const SKIPPING: &str = "SKIPPING"; +pub const VECTOR: &str = "VECTOR"; pub type RawIntervalExpr = String; @@ -928,6 +930,61 @@ impl<'a> ParserContext<'a> { is_index_declared |= true; } + // vector index + if let Token::Word(word) = parser.peek_token().token + && word.value.eq_ignore_ascii_case(VECTOR) + { + parser.next_token(); + // Consume `INDEX` keyword + ensure!( + parser.parse_keyword(Keyword::INDEX), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "expect INDEX after VECTOR keyword", + } + ); + + ensure!( + column_extensions.vector_index_options.is_none(), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "duplicated VECTOR INDEX option", + } + ); + + // Check that column is a vector type + let column_type = get_unalias_type(column_type); + let data_type = sql_data_type_to_concrete_data_type(&column_type, column_extensions)?; + ensure!( + matches!(data_type, ConcreteDataType::Vector(_)), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "VECTOR INDEX only supports Vector type columns", + } + ); + + let options = parser + .parse_options(Keyword::WITH) + .context(error::SyntaxSnafu)? + .into_iter() + .map(parse_option_string) + .collect::>>()?; + + for (key, _) in options.iter() { + ensure!( + validate_column_vector_index_create_option(key), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: format!("invalid VECTOR INDEX option: {key}"), + } + ); + } + + let options = OptionMap::new(options); + column_extensions.vector_index_options = Some(options); + is_index_declared |= true; + } + Ok(is_index_declared) } @@ -2714,7 +2771,8 @@ CREATE TABLE log ( #[test] fn test_parse_column_extensions_vector() { - let sql = "VECTOR(128)"; + // Test that vector options are parsed from data_type (no additional SQL needed) + let sql = ""; let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); @@ -2734,7 +2792,8 @@ CREATE TABLE log ( #[test] fn test_parse_column_extensions_vector_invalid() { - let sql = "VECTOR()"; + // Test that vector with no dimension fails + let sql = ""; let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); @@ -2912,4 +2971,174 @@ CREATE TABLE log ( .unwrap(); assert_eq!("SELECT '10 seconds'::INTERVAL", &stmts[0].to_string()); } + + #[test] + fn test_parse_create_table_vector_index_options() { + // Test basic vector index + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::CreateTable(c) = &result[0] { + c.columns.iter().for_each(|col| { + if col.name().value == "vec" { + assert!( + col.extensions + .vector_index_options + .as_ref() + .unwrap() + .is_empty() + ); + } + }); + } else { + panic!("should be create_table statement"); + } + + // Test vector index with options + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX WITH (metric='cosine', connectivity='32', expansion_add='256', expansion_search='128') +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::CreateTable(c) = &result[0] { + c.columns.iter().for_each(|col| { + if col.name().value == "vec" { + let options = col.extensions.vector_index_options.as_ref().unwrap(); + assert_eq!(options.len(), 4); + assert_eq!(options.get("metric").unwrap(), "cosine"); + assert_eq!(options.get("connectivity").unwrap(), "32"); + assert_eq!(options.get("expansion_add").unwrap(), "256"); + assert_eq!(options.get("expansion_search").unwrap(), "128"); + } + }); + } else { + panic!("should be create_table statement"); + } + } + + #[test] + fn test_parse_create_table_vector_index_invalid_type() { + // Test vector index on non-vector type (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + col INT VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("VECTOR INDEX only supports Vector type columns") + ); + } + + #[test] + fn test_parse_create_table_vector_index_duplicate() { + // Test duplicate vector index (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("duplicated VECTOR INDEX option") + ); + } + + #[test] + fn test_parse_create_table_vector_index_invalid_option() { + // Test invalid option key (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX WITH (metric='l2sq', invalid_option='foo') +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("invalid VECTOR INDEX option") + ); + } + + #[test] + fn test_parse_column_extensions_vector_index() { + // Test vector index on vector type + { + let sql = "VECTOR INDEX WITH (metric = 'l2sq')"; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let mut parser = Parser::new(&dialect).with_tokens(tokens); + let name = Ident::new("vec_col"); + let data_type = + DataType::Custom(vec![Ident::new("VECTOR")].into(), vec!["128".to_string()]); + // First, parse the vector type to set vector_options + let mut extensions = ColumnExtensions { + vector_options: Some(OptionMap::from([( + VECTOR_OPT_DIM.to_string(), + "128".to_string(), + )])), + ..Default::default() + }; + + let result = ParserContext::parse_column_extensions( + &mut parser, + &name, + &data_type, + &mut extensions, + ); + assert!(result.is_ok()); + assert!(extensions.vector_index_options.is_some()); + let vi_options = extensions.vector_index_options.unwrap(); + assert_eq!(vi_options.get("metric"), Some("l2sq")); + } + + // Test vector index on non-vector type (should fail) + { + let sql = "VECTOR INDEX"; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let mut parser = Parser::new(&dialect).with_tokens(tokens); + let name = Ident::new("num_col"); + let data_type = DataType::Int(None); // Non-vector type + let mut extensions = ColumnExtensions::default(); + let result = ParserContext::parse_column_extensions( + &mut parser, + &name, + &data_type, + &mut extensions, + ); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("VECTOR INDEX only supports Vector type columns") + ); + } + } } diff --git a/src/sql/src/parsers/utils.rs b/src/sql/src/parsers/utils.rs index 5938018082..74bb6bd803 100644 --- a/src/sql/src/parsers/utils.rs +++ b/src/sql/src/parsers/utils.rs @@ -222,6 +222,29 @@ pub fn validate_column_skipping_index_create_option(key: &str) -> bool { .contains(&key) } +/// Valid options for VECTOR INDEX: +/// - engine: Vector index engine (usearch) +/// - metric: Distance metric (l2sq, cosine, inner_product) +/// - connectivity: HNSW M parameter +/// - expansion_add: ef_construction parameter +/// - expansion_search: ef_search parameter +pub const COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE: &str = "engine"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_METRIC: &str = "metric"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY: &str = "connectivity"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD: &str = "expansion_add"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH: &str = "expansion_search"; + +pub fn validate_column_vector_index_create_option(key: &str) -> bool { + [ + COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE, + COLUMN_VECTOR_INDEX_OPT_KEY_METRIC, + COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY, + COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD, + COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH, + ] + .contains(&key) +} + /// Convert an [`IntervalMonthDayNano`] to a [`Duration`]. #[cfg(feature = "enterprise")] pub fn convert_month_day_nano_to_duration( diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index 863291244b..a6ee60164a 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -55,7 +55,7 @@ use crate::ast::{ use crate::error::{ self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result, SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu, - SetSkippingIndexOptionSnafu, SqlCommonSnafu, + SetSkippingIndexOptionSnafu, SetVectorIndexOptionSnafu, SqlCommonSnafu, }; use crate::statements::create::{Column, ColumnExtensions}; pub use crate::statements::option_map::OptionMap; @@ -147,6 +147,12 @@ pub fn column_to_schema( .context(SetSkippingIndexOptionSnafu)?; } + if let Some(options) = column.extensions.build_vector_index_options()? { + column_schema = column_schema + .with_vector_index_options(&options) + .context(SetVectorIndexOptionSnafu)?; + } + column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some()); if matches!(column.data_type(), SqlDataType::JSON) { @@ -710,6 +716,7 @@ mod tests { skipping_index_options: None, inverted_index_options: None, json_datatype_options: None, + vector_index_options: None, }, }; @@ -720,4 +727,82 @@ mod tests { assert_eq!(fulltext_options.analyzer, FulltextAnalyzer::English); assert!(fulltext_options.case_sensitive); } + + #[test] + fn test_column_to_schema_with_vector_index() { + use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType}; + + // Test with custom metric and parameters + let column = Column { + column_def: ColumnDef { + name: "embedding".into(), + data_type: SqlDataType::Custom( + vec![Ident::new(VECTOR_TYPE_NAME)].into(), + vec!["128".to_string()], + ), + options: vec![], + }, + extensions: ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([ + ("metric".to_string(), "cosine".to_string()), + ("connectivity".to_string(), "32".to_string()), + ("expansion_add".to_string(), "200".to_string()), + ("expansion_search".to_string(), "100".to_string()), + ])), + }, + }; + + let column_schema = column_to_schema(&column, "ts", None).unwrap(); + assert_eq!("embedding", column_schema.name); + assert!(column_schema.is_vector_indexed()); + + let vector_options = column_schema.vector_index_options().unwrap().unwrap(); + assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch); + assert_eq!(vector_options.metric, VectorDistanceMetric::Cosine); + assert_eq!(vector_options.connectivity, 32); + assert_eq!(vector_options.expansion_add, 200); + assert_eq!(vector_options.expansion_search, 100); + } + + #[test] + fn test_column_to_schema_with_vector_index_defaults() { + use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType}; + + // Test with default values (empty options map) + let column = Column { + column_def: ColumnDef { + name: "vec".into(), + data_type: SqlDataType::Custom( + vec![Ident::new(VECTOR_TYPE_NAME)].into(), + vec!["64".to_string()], + ), + options: vec![], + }, + extensions: ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::default()), + }, + }; + + let column_schema = column_to_schema(&column, "ts", None).unwrap(); + assert_eq!("vec", column_schema.name); + assert!(column_schema.is_vector_indexed()); + + let vector_options = column_schema.vector_index_options().unwrap().unwrap(); + // Verify defaults + assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch); + assert_eq!(vector_options.metric, VectorDistanceMetric::L2sq); + assert_eq!(vector_options.connectivity, 16); + assert_eq!(vector_options.expansion_add, 128); + assert_eq!(vector_options.expansion_search, 64); + } } diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 3c7f6d1731..3791effac0 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -17,7 +17,10 @@ use std::fmt::{Display, Formatter}; use common_catalog::consts::FILE_ENGINE; use datatypes::json::JsonStructureSettings; -use datatypes::schema::{FulltextOptions, SkippingIndexOptions}; +use datatypes::schema::{ + FulltextOptions, SkippingIndexOptions, VectorDistanceMetric, VectorIndexEngineType, + VectorIndexOptions, +}; use itertools::Itertools; use serde::Serialize; use snafu::ResultExt; @@ -133,6 +136,8 @@ pub struct ColumnExtensions { /// /// Inverted index doesn't have options at present. There won't be any options in that map. pub inverted_index_options: Option, + /// Vector index options for HNSW-based vector similarity search. + pub vector_index_options: Option, pub json_datatype_options: Option, } @@ -208,6 +213,15 @@ impl Display for Column { write!(f, " INVERTED INDEX")?; } } + + if let Some(vector_index_options) = &self.extensions.vector_index_options { + if !vector_index_options.is_empty() { + let options = vector_index_options.kv_pairs(); + write!(f, " VECTOR INDEX WITH({})", format_list_comma!(options))?; + } else { + write!(f, " VECTOR INDEX")?; + } + } Ok(()) } } @@ -233,6 +247,89 @@ impl ColumnExtensions { )) } + pub fn build_vector_index_options(&self) -> Result> { + let Some(options) = self.vector_index_options.as_ref() else { + return Ok(None); + }; + + let options_map: HashMap = options.clone().into_map(); + let mut result = VectorIndexOptions::default(); + + if let Some(s) = options_map.get("engine") { + result.engine = s.parse::().map_err(|e| { + InvalidSqlSnafu { + msg: format!("invalid VECTOR INDEX engine: {e}"), + } + .build() + })?; + } + + if let Some(s) = options_map.get("metric") { + result.metric = s.parse::().map_err(|e| { + InvalidSqlSnafu { + msg: format!("invalid VECTOR INDEX metric: {e}"), + } + .build() + })?; + } + + if let Some(s) = options_map.get("connectivity") { + let value = s.parse::().map_err(|_| { + InvalidSqlSnafu { + msg: format!( + "invalid VECTOR INDEX connectivity: {s}, expected positive integer" + ), + } + .build() + })?; + if !(2..=2048).contains(&value) { + return InvalidSqlSnafu { + msg: "VECTOR INDEX connectivity must be in the range [2, 2048].".to_string(), + } + .fail(); + } + result.connectivity = value; + } + + if let Some(s) = options_map.get("expansion_add") { + let value = s.parse::().map_err(|_| { + InvalidSqlSnafu { + msg: format!( + "invalid VECTOR INDEX expansion_add: {s}, expected positive integer" + ), + } + .build() + })?; + if value == 0 { + return InvalidSqlSnafu { + msg: "VECTOR INDEX expansion_add must be greater than 0".to_string(), + } + .fail(); + } + result.expansion_add = value; + } + + if let Some(s) = options_map.get("expansion_search") { + let value = s.parse::().map_err(|_| { + InvalidSqlSnafu { + msg: format!( + "invalid VECTOR INDEX expansion_search: {s}, expected positive integer" + ), + } + .build() + })?; + if value == 0 { + return InvalidSqlSnafu { + msg: "VECTOR INDEX expansion_search must be greater than 0".to_string(), + } + .fail(); + } + result.expansion_search = value; + } + + Ok(Some(result)) + } + pub fn build_json_structure_settings(&self) -> Result> { let Some(options) = self.json_datatype_options.as_ref() else { return Ok(None); @@ -893,4 +990,92 @@ AS SELECT number FROM numbers_input where number > 10"#, _ => unreachable!(), } } + + #[test] + fn test_vector_index_options_validation() { + use super::{ColumnExtensions, OptionMap}; + + // Test zero connectivity should fail + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([( + "connectivity".to_string(), + "0".to_string(), + )])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("connectivity must be in the range [2, 2048]") + ); + + // Test zero expansion_add should fail + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([( + "expansion_add".to_string(), + "0".to_string(), + )])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("expansion_add must be greater than 0") + ); + + // Test zero expansion_search should fail + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([( + "expansion_search".to_string(), + "0".to_string(), + )])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("expansion_search must be greater than 0") + ); + + // Test valid values should succeed + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([ + ("connectivity".to_string(), "32".to_string()), + ("expansion_add".to_string(), "200".to_string()), + ("expansion_search".to_string(), "100".to_string()), + ])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_ok()); + let options = result.unwrap().unwrap(); + assert_eq!(options.connectivity, 32); + assert_eq!(options.expansion_add, 200); + assert_eq!(options.expansion_search, 100); + } } diff --git a/src/store-api/src/storage.rs b/src/store-api/src/storage.rs index 36b28b511c..b97fe0b3ad 100644 --- a/src/store-api/src/storage.rs +++ b/src/store-api/src/storage.rs @@ -27,5 +27,8 @@ pub use datatypes::schema::{ pub use self::descriptors::*; pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, ParseIdError}; -pub use self::requests::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector}; +pub use self::requests::{ + ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector, VectorDistanceMetric, + VectorIndexEngine, VectorIndexEngineType, VectorSearchMatches, VectorSearchRequest, +}; pub use self::types::{SequenceNumber, SequenceRange}; diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs index 5e9fae3215..e538127e73 100644 --- a/src/store-api/src/storage/requests.rs +++ b/src/store-api/src/storage/requests.rs @@ -14,11 +14,66 @@ use std::fmt::{Display, Formatter}; +use common_error::ext::BoxedError; use common_recordbatch::OrderOption; use datafusion_expr::expr::Expr; +// Re-export vector types from datatypes to avoid duplication +pub use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType}; use strum::Display; -use crate::storage::SequenceNumber; +use crate::storage::{ColumnId, SequenceNumber}; + +/// A hint for KNN vector search. +#[derive(Debug, Clone, PartialEq)] +pub struct VectorSearchRequest { + /// Column ID of the vector column to search. + pub column_id: ColumnId, + /// The query vector to search for. + pub query_vector: Vec, + /// Number of nearest neighbors to return. + pub k: usize, + /// Distance metric to use (matches the index metric). + pub metric: VectorDistanceMetric, +} + +/// Search results from vector index. +#[derive(Debug, Clone, PartialEq)] +pub struct VectorSearchMatches { + /// Keys (row offsets in the index). + pub keys: Vec, + /// Distances from the query vector. + pub distances: Vec, +} + +/// Trait for vector index engines (HNSW implementations). +/// +/// This trait defines the interface for pluggable vector index engines. +/// Implementations (e.g., UsearchEngine) are provided by storage engines like mito2. +pub trait VectorIndexEngine: Send + Sync { + /// Adds a vector with the given key. + fn add(&mut self, key: u64, vector: &[f32]) -> Result<(), BoxedError>; + + /// Searches for k nearest neighbors. + fn search(&self, query: &[f32], k: usize) -> Result; + + /// Returns the serialized length. + fn serialized_length(&self) -> usize; + + /// Serializes the index to a buffer. + fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<(), BoxedError>; + + /// Reserves capacity for vectors. + fn reserve(&mut self, capacity: usize) -> Result<(), BoxedError>; + + /// Returns current size (number of vectors). + fn size(&self) -> usize; + + /// Returns current capacity. + fn capacity(&self) -> usize; + + /// Returns memory usage in bytes. + fn memory_usage(&self) -> usize; +} /// A hint on how to select rows from a time-series. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)] @@ -38,7 +93,7 @@ pub enum TimeSeriesDistribution { PerSeries, } -#[derive(Default, Clone, Debug, PartialEq, Eq)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct ScanRequest { /// Indices of columns to read, `None` to read all columns. This indices is /// based on table schema. @@ -66,6 +121,9 @@ pub struct ScanRequest { pub sst_min_sequence: Option, /// Optional hint for the distribution of time-series data. pub distribution: Option, + /// Optional hint for KNN vector search. When set, the scan should use + /// vector index to find the k nearest neighbors. + pub vector_search: Option, } impl Display for ScanRequest { @@ -138,6 +196,16 @@ impl Display for ScanRequest { if let Some(distribution) = &self.distribution { write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?; } + if let Some(vector_search) = &self.vector_search { + write!( + f, + "{}vector_search: column_id={}, k={}, metric={}", + delimiter.as_str(), + vector_search.column_id, + vector_search.k, + vector_search.metric + )?; + } write!(f, " }}") } }