mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2025-12-25 23:49:58 +00:00
feat(vector_index): adds the foundational types and SQL parsing support for vector index (#7366)
* feat: adds the foundational types and SQL parsing support for vector index Signed-off-by: Dennis Zhuang <killme2008@gmail.com> * refactor: by suggestions Signed-off-by: Dennis Zhuang <killme2008@gmail.com> * fix: ensure index option values must be greater than zero Signed-off-by: Dennis Zhuang <killme2008@gmail.com> * chore: validate connectivity strictly Signed-off-by: Dennis Zhuang <killme2008@gmail.com> * fix: compile error Signed-off-by: Dennis Zhuang <killme2008@gmail.com> * feat: disable SIMD for ci Signed-off-by: Dennis Zhuang <killme2008@gmail.com> --------- Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
This commit is contained in:
111
Cargo.lock
generated
111
Cargo.lock
generated
@@ -1977,6 +1977,17 @@ dependencies = [
|
|||||||
"unicode-width 0.2.1",
|
"unicode-width 0.2.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "codespan-reporting"
|
||||||
|
version = "0.13.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
"termcolor",
|
||||||
|
"unicode-width 0.2.1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorchoice"
|
name = "colorchoice"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
@@ -3169,6 +3180,68 @@ dependencies = [
|
|||||||
"cipher",
|
"cipher",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cxx"
|
||||||
|
version = "1.0.190"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a7620f6cfc4dcca21f2b085b7a890e16c60fd66f560cd69ee60594908dc72ab1"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"cxx-build",
|
||||||
|
"cxxbridge-cmd",
|
||||||
|
"cxxbridge-flags",
|
||||||
|
"cxxbridge-macro",
|
||||||
|
"foldhash 0.2.0",
|
||||||
|
"link-cplusplus",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cxx-build"
|
||||||
|
version = "1.0.190"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a9bc1a22964ff6a355fbec24cf68266a0ed28f8b84c0864c386474ea3d0e479"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"codespan-reporting 0.13.1",
|
||||||
|
"indexmap 2.11.4",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"scratch",
|
||||||
|
"syn 2.0.106",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cxxbridge-cmd"
|
||||||
|
version = "1.0.190"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b1f29a879d35f7906e3c9b77d7a1005a6a0787d330c09dfe4ffb5f617728cb44"
|
||||||
|
dependencies = [
|
||||||
|
"clap 4.5.40",
|
||||||
|
"codespan-reporting 0.13.1",
|
||||||
|
"indexmap 2.11.4",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.106",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cxxbridge-flags"
|
||||||
|
version = "1.0.190"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d67109015f93f683e364085aa6489a5b2118b4a40058482101d699936a7836d6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cxxbridge-macro"
|
||||||
|
version = "1.0.190"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d187e019e7b05a1f3e69a8396b70800ee867aa9fc2ab972761173ccee03742df"
|
||||||
|
dependencies = [
|
||||||
|
"indexmap 2.11.4",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.106",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "darling"
|
name = "darling"
|
||||||
version = "0.14.4"
|
version = "0.14.4"
|
||||||
@@ -4916,6 +4989,12 @@ version = "0.1.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "foldhash"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "form_urlencoded"
|
name = "form_urlencoded"
|
||||||
version = "1.2.2"
|
version = "1.2.2"
|
||||||
@@ -5516,7 +5595,7 @@ checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"allocator-api2",
|
"allocator-api2",
|
||||||
"equivalent",
|
"equivalent",
|
||||||
"foldhash",
|
"foldhash 0.1.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6161,6 +6240,7 @@ dependencies = [
|
|||||||
"common-telemetry",
|
"common-telemetry",
|
||||||
"common-test-util",
|
"common-test-util",
|
||||||
"criterion 0.4.0",
|
"criterion 0.4.0",
|
||||||
|
"datatypes",
|
||||||
"fastbloom",
|
"fastbloom",
|
||||||
"fst",
|
"fst",
|
||||||
"futures",
|
"futures",
|
||||||
@@ -6169,6 +6249,7 @@ dependencies = [
|
|||||||
"jieba-rs",
|
"jieba-rs",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"mockall",
|
"mockall",
|
||||||
|
"nalgebra",
|
||||||
"pin-project",
|
"pin-project",
|
||||||
"prost 0.13.5",
|
"prost 0.13.5",
|
||||||
"puffin",
|
"puffin",
|
||||||
@@ -6186,6 +6267,7 @@ dependencies = [
|
|||||||
"tempfile",
|
"tempfile",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
"usearch",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -7017,6 +7099,15 @@ dependencies = [
|
|||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "link-cplusplus"
|
||||||
|
version = "1.0.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "linked-hash-map"
|
name = "linked-hash-map"
|
||||||
version = "0.5.6"
|
version = "0.5.6"
|
||||||
@@ -11290,6 +11381,12 @@ version = "1.2.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "scratch"
|
||||||
|
version = "1.0.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "scrypt"
|
name = "scrypt"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
@@ -14143,6 +14240,16 @@ version = "2.1.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "usearch"
|
||||||
|
version = "2.21.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2cc9fc5f872a3a4f9081d5f42624d788231b763e1846c829b9968a3755ac884d"
|
||||||
|
dependencies = [
|
||||||
|
"cxx",
|
||||||
|
"cxx-build",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8-ranges"
|
name = "utf8-ranges"
|
||||||
version = "1.0.5"
|
version = "1.0.5"
|
||||||
@@ -14282,7 +14389,7 @@ dependencies = [
|
|||||||
"ciborium",
|
"ciborium",
|
||||||
"cidr",
|
"cidr",
|
||||||
"clap 4.5.40",
|
"clap 4.5.40",
|
||||||
"codespan-reporting",
|
"codespan-reporting 0.12.0",
|
||||||
"community-id",
|
"community-id",
|
||||||
"convert_case 0.7.1",
|
"convert_case 0.7.1",
|
||||||
"crc",
|
"crc",
|
||||||
|
|||||||
@@ -428,7 +428,7 @@ pub trait InformationExtension {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The request to inspect the datanode.
|
/// The request to inspect the datanode.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
pub struct DatanodeInspectRequest {
|
pub struct DatanodeInspectRequest {
|
||||||
/// Kind to fetch from datanode.
|
/// Kind to fetch from datanode.
|
||||||
pub kind: DatanodeInspectKind,
|
pub kind: DatanodeInspectKind,
|
||||||
|
|||||||
@@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{
|
|||||||
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
|
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
|
||||||
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
|
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
|
||||||
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
|
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
|
||||||
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
|
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY,
|
||||||
|
VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions,
|
||||||
};
|
};
|
||||||
pub use crate::schema::constraint::ColumnDefaultConstraint;
|
pub use crate::schema::constraint::ColumnDefaultConstraint;
|
||||||
pub use crate::schema::raw::RawSchema;
|
pub use crate::schema::raw::RawSchema;
|
||||||
|
|||||||
@@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
|
|||||||
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
|
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
|
||||||
/// Key used to store skip options in arrow field's metadata.
|
/// Key used to store skip options in arrow field's metadata.
|
||||||
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
|
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
|
||||||
|
/// Key used to store vector index options in arrow field's metadata.
|
||||||
|
pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index";
|
||||||
|
|
||||||
/// Keys used in fulltext options
|
/// Keys used in fulltext options
|
||||||
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
|
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
|
||||||
@@ -216,6 +218,53 @@ impl ColumnSchema {
|
|||||||
self.metadata.contains_key(INVERTED_INDEX_KEY)
|
self.metadata.contains_key(INVERTED_INDEX_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks if this column has a vector index.
|
||||||
|
pub fn is_vector_indexed(&self) -> bool {
|
||||||
|
match self.vector_index_options() {
|
||||||
|
Ok(opts) => opts.is_some(),
|
||||||
|
Err(e) => {
|
||||||
|
common_telemetry::warn!(
|
||||||
|
"Failed to deserialize vector_index_options for column '{}': {}",
|
||||||
|
self.name,
|
||||||
|
e
|
||||||
|
);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Gets the vector index options.
|
||||||
|
pub fn vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
|
||||||
|
match self.metadata.get(VECTOR_INDEX_KEY) {
|
||||||
|
None => Ok(None),
|
||||||
|
Some(json) => {
|
||||||
|
let options =
|
||||||
|
serde_json::from_str(json).context(error::DeserializeSnafu { json })?;
|
||||||
|
Ok(Some(options))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the vector index options.
|
||||||
|
pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> {
|
||||||
|
self.metadata.insert(
|
||||||
|
VECTOR_INDEX_KEY.to_string(),
|
||||||
|
serde_json::to_string(options).context(error::SerializeSnafu)?,
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Removes the vector index options.
|
||||||
|
pub fn unset_vector_index_options(&mut self) {
|
||||||
|
self.metadata.remove(VECTOR_INDEX_KEY);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets vector index options and returns self for chaining.
|
||||||
|
pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result<Self> {
|
||||||
|
self.set_vector_index_options(options)?;
|
||||||
|
Ok(self)
|
||||||
|
}
|
||||||
|
|
||||||
/// Set default constraint.
|
/// Set default constraint.
|
||||||
///
|
///
|
||||||
/// If a default constraint exists for the column, this method will
|
/// If a default constraint exists for the column, this method will
|
||||||
@@ -964,6 +1013,181 @@ impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Distance metric for vector similarity search.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum VectorDistanceMetric {
|
||||||
|
/// Squared Euclidean distance (L2^2).
|
||||||
|
#[default]
|
||||||
|
L2sq,
|
||||||
|
/// Cosine distance (1 - cosine similarity).
|
||||||
|
Cosine,
|
||||||
|
/// Inner product (negative, for maximum inner product search).
|
||||||
|
#[serde(alias = "ip")]
|
||||||
|
InnerProduct,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for VectorDistanceMetric {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
VectorDistanceMetric::L2sq => write!(f, "l2sq"),
|
||||||
|
VectorDistanceMetric::Cosine => write!(f, "cosine"),
|
||||||
|
VectorDistanceMetric::InnerProduct => write!(f, "ip"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::str::FromStr for VectorDistanceMetric {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||||
|
match s.to_lowercase().as_str() {
|
||||||
|
"l2sq" | "l2" | "euclidean" => Ok(VectorDistanceMetric::L2sq),
|
||||||
|
"cosine" | "cos" => Ok(VectorDistanceMetric::Cosine),
|
||||||
|
"inner_product" | "ip" | "dot" => Ok(VectorDistanceMetric::InnerProduct),
|
||||||
|
_ => Err(format!(
|
||||||
|
"Unknown distance metric: {}. Expected: l2sq, cosine, or ip",
|
||||||
|
s
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VectorDistanceMetric {
|
||||||
|
/// Returns the metric as u8 for blob serialization.
|
||||||
|
pub fn as_u8(&self) -> u8 {
|
||||||
|
match self {
|
||||||
|
Self::L2sq => 0,
|
||||||
|
Self::Cosine => 1,
|
||||||
|
Self::InnerProduct => 2,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses metric from u8 (used when reading blob).
|
||||||
|
pub fn try_from_u8(v: u8) -> Option<Self> {
|
||||||
|
match v {
|
||||||
|
0 => Some(Self::L2sq),
|
||||||
|
1 => Some(Self::Cosine),
|
||||||
|
2 => Some(Self::InnerProduct),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Default HNSW connectivity parameter.
const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16;
/// Default expansion factor during index construction.
const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128;
/// Default expansion factor during search.
const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64;

/// Serde default provider for the `connectivity` option.
fn default_vector_index_connectivity() -> u32 {
    DEFAULT_VECTOR_INDEX_CONNECTIVITY
}

/// Serde default provider for the `expansion_add` option.
fn default_vector_index_expansion_add() -> u32 {
    DEFAULT_VECTOR_INDEX_EXPANSION_ADD
}

/// Serde default provider for the `expansion_search` option.
fn default_vector_index_expansion_search() -> u32 {
    DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH
}
|
||||||
|
|
||||||
|
/// Supported vector index engine types.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
pub enum VectorIndexEngineType {
|
||||||
|
/// USearch HNSW implementation.
|
||||||
|
#[default]
|
||||||
|
Usearch,
|
||||||
|
// Future: Vsag,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VectorIndexEngineType {
|
||||||
|
/// Returns the engine type as u8 for blob serialization.
|
||||||
|
pub fn as_u8(&self) -> u8 {
|
||||||
|
match self {
|
||||||
|
Self::Usearch => 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses engine type from u8 (used when reading blob).
|
||||||
|
pub fn try_from_u8(v: u8) -> Option<Self> {
|
||||||
|
match v {
|
||||||
|
0 => Some(Self::Usearch),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for VectorIndexEngineType {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
Self::Usearch => write!(f, "usearch"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::str::FromStr for VectorIndexEngineType {
|
||||||
|
type Err = String;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||||
|
match s.to_lowercase().as_str() {
|
||||||
|
"usearch" => Ok(Self::Usearch),
|
||||||
|
_ => Err(format!(
|
||||||
|
"Unknown vector index engine: {}. Expected: usearch",
|
||||||
|
s
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Options for vector index (HNSW).
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
|
||||||
|
#[serde(rename_all = "kebab-case")]
|
||||||
|
pub struct VectorIndexOptions {
|
||||||
|
/// Vector index engine type (default: usearch).
|
||||||
|
#[serde(default)]
|
||||||
|
pub engine: VectorIndexEngineType,
|
||||||
|
/// Distance metric for similarity search.
|
||||||
|
#[serde(default)]
|
||||||
|
pub metric: VectorDistanceMetric,
|
||||||
|
/// HNSW connectivity parameter (M in the paper).
|
||||||
|
/// Higher values improve recall but increase memory usage.
|
||||||
|
#[serde(default = "default_vector_index_connectivity")]
|
||||||
|
pub connectivity: u32,
|
||||||
|
/// Expansion factor during index construction (ef_construction).
|
||||||
|
/// Higher values improve index quality but slow down construction.
|
||||||
|
#[serde(default = "default_vector_index_expansion_add")]
|
||||||
|
pub expansion_add: u32,
|
||||||
|
/// Expansion factor during search (ef_search).
|
||||||
|
/// Higher values improve recall but slow down search.
|
||||||
|
#[serde(default = "default_vector_index_expansion_search")]
|
||||||
|
pub expansion_search: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for VectorIndexOptions {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
engine: VectorIndexEngineType::default(),
|
||||||
|
metric: VectorDistanceMetric::default(),
|
||||||
|
connectivity: DEFAULT_VECTOR_INDEX_CONNECTIVITY,
|
||||||
|
expansion_add: DEFAULT_VECTOR_INDEX_EXPANSION_ADD,
|
||||||
|
expansion_search: DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for VectorIndexOptions {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}",
|
||||||
|
self.engine, self.metric, self.connectivity, self.expansion_add, self.expansion_search
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ common-error.workspace = true
|
|||||||
common-macro.workspace = true
|
common-macro.workspace = true
|
||||||
common-runtime.workspace = true
|
common-runtime.workspace = true
|
||||||
common-telemetry.workspace = true
|
common-telemetry.workspace = true
|
||||||
|
datatypes.workspace = true
|
||||||
fastbloom = "0.8"
|
fastbloom = "0.8"
|
||||||
fst.workspace = true
|
fst.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
@@ -25,6 +26,7 @@ itertools.workspace = true
|
|||||||
jieba-rs = "0.8"
|
jieba-rs = "0.8"
|
||||||
lazy_static.workspace = true
|
lazy_static.workspace = true
|
||||||
mockall.workspace = true
|
mockall.workspace = true
|
||||||
|
nalgebra.workspace = true
|
||||||
pin-project.workspace = true
|
pin-project.workspace = true
|
||||||
prost.workspace = true
|
prost.workspace = true
|
||||||
puffin.workspace = true
|
puffin.workspace = true
|
||||||
@@ -39,6 +41,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] }
|
|||||||
tantivy-jieba = "0.16"
|
tantivy-jieba = "0.16"
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
|
usearch = { version = "2.21", default-features = false, features = ["fp16lib"] }
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ pub mod external_provider;
|
|||||||
pub mod fulltext_index;
|
pub mod fulltext_index;
|
||||||
pub mod inverted_index;
|
pub mod inverted_index;
|
||||||
pub mod target;
|
pub mod target;
|
||||||
|
pub mod vector;
|
||||||
|
|
||||||
pub type Bytes = Vec<u8>;
|
pub type Bytes = Vec<u8>;
|
||||||
pub type BytesRef<'a> = &'a [u8];
|
pub type BytesRef<'a> = &'a [u8];
|
||||||
|
|||||||
163
src/index/src/vector.rs
Normal file
163
src/index/src/vector.rs
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
// Copyright 2023 Greptime Team
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
//! Vector index types and options.
|
||||||
|
//!
|
||||||
|
//! This module re-exports types from `datatypes` and provides conversions
|
||||||
|
//! to USearch types, as well as distance computation functions.
|
||||||
|
|
||||||
|
pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions};
|
||||||
|
use nalgebra::DVectorView;
|
||||||
|
pub use usearch::MetricKind;
|
||||||
|
|
||||||
|
/// Converts a VectorDistanceMetric to a USearch MetricKind.
|
||||||
|
pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind {
|
||||||
|
match metric {
|
||||||
|
VectorDistanceMetric::L2sq => MetricKind::L2sq,
|
||||||
|
VectorDistanceMetric::Cosine => MetricKind::Cos,
|
||||||
|
VectorDistanceMetric::InnerProduct => MetricKind::IP,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes distance between two vectors using the specified metric.
|
||||||
|
///
|
||||||
|
/// Uses SIMD-optimized implementations via nalgebra.
|
||||||
|
///
|
||||||
|
/// **Note:** The caller must ensure that the two vectors have the same length
|
||||||
|
/// and are non-empty. Empty vectors return 0.0 for all metrics.
|
||||||
|
pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 {
|
||||||
|
// Empty vectors are degenerate; return 0.0 uniformly across all metrics.
|
||||||
|
if v1.is_empty() || v2.is_empty() {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
match metric {
|
||||||
|
VectorDistanceMetric::L2sq => l2sq(v1, v2),
|
||||||
|
VectorDistanceMetric::Cosine => cosine(v1, v2),
|
||||||
|
VectorDistanceMetric::InnerProduct => -dot(v1, v2),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculates the squared L2 distance between two vectors.
|
||||||
|
fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 {
|
||||||
|
let lhs = DVectorView::from_slice(lhs, lhs.len());
|
||||||
|
let rhs = DVectorView::from_slice(rhs, rhs.len());
|
||||||
|
(lhs - rhs).norm_squared()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculates the cosine distance between two vectors.
|
||||||
|
///
|
||||||
|
/// Returns a value in `[0.0, 2.0]` where 0.0 means identical direction and 2.0 means
|
||||||
|
/// opposite direction. For degenerate cases (zero or near-zero magnitude vectors),
|
||||||
|
/// returns 1.0 (maximum uncertainty) to avoid NaN and ensure safe index operations.
|
||||||
|
fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 {
|
||||||
|
let lhs_vec = DVectorView::from_slice(lhs, lhs.len());
|
||||||
|
let rhs_vec = DVectorView::from_slice(rhs, rhs.len());
|
||||||
|
|
||||||
|
let dot_product = lhs_vec.dot(&rhs_vec);
|
||||||
|
let lhs_norm = lhs_vec.norm();
|
||||||
|
let rhs_norm = rhs_vec.norm();
|
||||||
|
|
||||||
|
// Zero-magnitude vectors have undefined direction; return max distance as safe fallback.
|
||||||
|
if dot_product.abs() < f32::EPSILON
|
||||||
|
|| lhs_norm.abs() < f32::EPSILON
|
||||||
|
|| rhs_norm.abs() < f32::EPSILON
|
||||||
|
{
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
let cos_similar = dot_product / (lhs_norm * rhs_norm);
|
||||||
|
let res = 1.0 - cos_similar;
|
||||||
|
// Clamp near-zero results to exactly 0.0 to avoid floating-point artifacts.
|
||||||
|
if res.abs() < f32::EPSILON { 0.0 } else { res }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculates the dot product between two vectors.
|
||||||
|
fn dot(lhs: &[f32], rhs: &[f32]) -> f32 {
|
||||||
|
let lhs = DVectorView::from_slice(lhs, lhs.len());
|
||||||
|
let rhs = DVectorView::from_slice(rhs, rhs.len());
|
||||||
|
lhs.dot(&rhs)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing floating-point distances.
    const TOLERANCE: f32 = 1e-6;

    /// Asserts that `actual` lies within [`TOLERANCE`] of `expected`.
    fn assert_close(actual: f32, expected: f32) {
        assert!((actual - expected).abs() < TOLERANCE);
    }

    #[test]
    fn test_distance_metric_to_usearch() {
        let cases = [
            (VectorDistanceMetric::L2sq, MetricKind::L2sq),
            (VectorDistanceMetric::Cosine, MetricKind::Cos),
            (VectorDistanceMetric::InnerProduct, MetricKind::IP),
        ];
        for (metric, expected) in cases {
            assert_eq!(distance_metric_to_usearch(metric), expected);
        }
    }

    #[test]
    fn test_vector_index_options_default() {
        let VectorIndexOptions {
            metric,
            connectivity,
            expansion_add,
            expansion_search,
            ..
        } = VectorIndexOptions::default();
        assert_eq!(metric, VectorDistanceMetric::L2sq);
        assert_eq!(connectivity, 16);
        assert_eq!(expansion_add, 128);
        assert_eq!(expansion_search, 64);
    }

    #[test]
    fn test_compute_distance_l2sq() {
        // L2sq = (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27
        let dist = compute_distance(
            &[1.0, 2.0, 3.0],
            &[4.0, 5.0, 6.0],
            VectorDistanceMetric::L2sq,
        );
        assert_close(dist, 27.0);
    }

    #[test]
    fn test_compute_distance_cosine() {
        // Orthogonal vectors have cosine similarity of 0, distance of 1
        let dist = compute_distance(
            &[1.0, 0.0, 0.0],
            &[0.0, 1.0, 0.0],
            VectorDistanceMetric::Cosine,
        );
        assert_close(dist, 1.0);
    }

    #[test]
    fn test_compute_distance_inner_product() {
        // Inner product = 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
        // Distance is negated: -32
        let dist = compute_distance(
            &[1.0, 2.0, 3.0],
            &[4.0, 5.0, 6.0],
            VectorDistanceMetric::InnerProduct,
        );
        assert_close(dist, -32.0);
    }

    #[test]
    fn test_compute_distance_empty_vectors() {
        // Empty vectors should return 0.0 uniformly for all metrics
        let metrics = [
            VectorDistanceMetric::L2sq,
            VectorDistanceMetric::Cosine,
            VectorDistanceMetric::InnerProduct,
        ];
        for metric in metrics {
            assert_eq!(compute_distance(&[], &[], metric), 0.0);
        }
    }
}
|
||||||
@@ -285,6 +285,13 @@ pub enum Error {
|
|||||||
location: Location,
|
location: Location,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
#[snafu(display("Failed to set VECTOR index option"))]
|
||||||
|
SetVectorIndexOption {
|
||||||
|
source: datatypes::error::Error,
|
||||||
|
#[snafu(implicit)]
|
||||||
|
location: Location,
|
||||||
|
},
|
||||||
|
|
||||||
#[snafu(display(
|
#[snafu(display(
|
||||||
"Invalid partition number: {}, should be in range [2, 65536]",
|
"Invalid partition number: {}, should be in range [2, 65536]",
|
||||||
partition_num
|
partition_num
|
||||||
@@ -394,7 +401,9 @@ impl ErrorExt for Error {
|
|||||||
ConvertValue { .. } => StatusCode::Unsupported,
|
ConvertValue { .. } => StatusCode::Unsupported,
|
||||||
|
|
||||||
PermissionDenied { .. } => StatusCode::PermissionDenied,
|
PermissionDenied { .. } => StatusCode::PermissionDenied,
|
||||||
SetFulltextOption { .. } | SetSkippingIndexOption { .. } => StatusCode::Unexpected,
|
SetFulltextOption { .. }
|
||||||
|
| SetSkippingIndexOption { .. }
|
||||||
|
| SetVectorIndexOption { .. } => StatusCode::Unexpected,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ use crate::parser::{FLOW, ParserContext};
|
|||||||
use crate::parsers::tql_parser;
|
use crate::parsers::tql_parser;
|
||||||
use crate::parsers::utils::{
|
use crate::parsers::utils::{
|
||||||
self, validate_column_fulltext_create_option, validate_column_skipping_index_create_option,
|
self, validate_column_fulltext_create_option, validate_column_skipping_index_create_option,
|
||||||
|
validate_column_vector_index_create_option,
|
||||||
};
|
};
|
||||||
use crate::statements::create::{
|
use crate::statements::create::{
|
||||||
Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable,
|
Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable,
|
||||||
@@ -60,6 +61,7 @@ pub const EXPIRE: &str = "EXPIRE";
|
|||||||
pub const AFTER: &str = "AFTER";
|
pub const AFTER: &str = "AFTER";
|
||||||
pub const INVERTED: &str = "INVERTED";
|
pub const INVERTED: &str = "INVERTED";
|
||||||
pub const SKIPPING: &str = "SKIPPING";
|
pub const SKIPPING: &str = "SKIPPING";
|
||||||
|
pub const VECTOR: &str = "VECTOR";
|
||||||
|
|
||||||
pub type RawIntervalExpr = String;
|
pub type RawIntervalExpr = String;
|
||||||
|
|
||||||
@@ -928,6 +930,61 @@ impl<'a> ParserContext<'a> {
|
|||||||
is_index_declared |= true;
|
is_index_declared |= true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// vector index
|
||||||
|
if let Token::Word(word) = parser.peek_token().token
|
||||||
|
&& word.value.eq_ignore_ascii_case(VECTOR)
|
||||||
|
{
|
||||||
|
parser.next_token();
|
||||||
|
// Consume `INDEX` keyword
|
||||||
|
ensure!(
|
||||||
|
parser.parse_keyword(Keyword::INDEX),
|
||||||
|
InvalidColumnOptionSnafu {
|
||||||
|
name: column_name.to_string(),
|
||||||
|
msg: "expect INDEX after VECTOR keyword",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
ensure!(
|
||||||
|
column_extensions.vector_index_options.is_none(),
|
||||||
|
InvalidColumnOptionSnafu {
|
||||||
|
name: column_name.to_string(),
|
||||||
|
msg: "duplicated VECTOR INDEX option",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Check that column is a vector type
|
||||||
|
let column_type = get_unalias_type(column_type);
|
||||||
|
let data_type = sql_data_type_to_concrete_data_type(&column_type, column_extensions)?;
|
||||||
|
ensure!(
|
||||||
|
matches!(data_type, ConcreteDataType::Vector(_)),
|
||||||
|
InvalidColumnOptionSnafu {
|
||||||
|
name: column_name.to_string(),
|
||||||
|
msg: "VECTOR INDEX only supports Vector type columns",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
let options = parser
|
||||||
|
.parse_options(Keyword::WITH)
|
||||||
|
.context(error::SyntaxSnafu)?
|
||||||
|
.into_iter()
|
||||||
|
.map(parse_option_string)
|
||||||
|
.collect::<Result<Vec<_>>>()?;
|
||||||
|
|
||||||
|
for (key, _) in options.iter() {
|
||||||
|
ensure!(
|
||||||
|
validate_column_vector_index_create_option(key),
|
||||||
|
InvalidColumnOptionSnafu {
|
||||||
|
name: column_name.to_string(),
|
||||||
|
msg: format!("invalid VECTOR INDEX option: {key}"),
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let options = OptionMap::new(options);
|
||||||
|
column_extensions.vector_index_options = Some(options);
|
||||||
|
is_index_declared |= true;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(is_index_declared)
|
Ok(is_index_declared)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2714,7 +2771,8 @@ CREATE TABLE log (
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse_column_extensions_vector() {
|
fn test_parse_column_extensions_vector() {
|
||||||
let sql = "VECTOR(128)";
|
// Test that vector options are parsed from data_type (no additional SQL needed)
|
||||||
|
let sql = "";
|
||||||
let dialect = GenericDialect {};
|
let dialect = GenericDialect {};
|
||||||
let mut tokenizer = Tokenizer::new(&dialect, sql);
|
let mut tokenizer = Tokenizer::new(&dialect, sql);
|
||||||
let tokens = tokenizer.tokenize().unwrap();
|
let tokens = tokenizer.tokenize().unwrap();
|
||||||
@@ -2734,7 +2792,8 @@ CREATE TABLE log (
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse_column_extensions_vector_invalid() {
|
fn test_parse_column_extensions_vector_invalid() {
|
||||||
let sql = "VECTOR()";
|
// Test that vector with no dimension fails
|
||||||
|
let sql = "";
|
||||||
let dialect = GenericDialect {};
|
let dialect = GenericDialect {};
|
||||||
let mut tokenizer = Tokenizer::new(&dialect, sql);
|
let mut tokenizer = Tokenizer::new(&dialect, sql);
|
||||||
let tokens = tokenizer.tokenize().unwrap();
|
let tokens = tokenizer.tokenize().unwrap();
|
||||||
@@ -2912,4 +2971,174 @@ CREATE TABLE log (
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!("SELECT '10 seconds'::INTERVAL", &stmts[0].to_string());
|
assert_eq!("SELECT '10 seconds'::INTERVAL", &stmts[0].to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_create_table_vector_index_options() {
    // Case 1: a bare `VECTOR INDEX` clause yields a present-but-empty option map.
    let sql = r"
CREATE TABLE vectors (
    ts TIMESTAMP TIME INDEX,
    vec VECTOR(128) VECTOR INDEX,
)";
    let result =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();

    let Statement::CreateTable(c) = &result[0] else {
        panic!("should be create_table statement");
    };
    // Look the column up explicitly so the test fails, instead of passing without running
    // any assertion, if the `vec` column ever goes missing from the parsed statement.
    let vec_col = c
        .columns
        .iter()
        .find(|col| col.name().value == "vec")
        .expect("column `vec` should be present in the parsed statement");
    assert!(
        vec_col
            .extensions
            .vector_index_options
            .as_ref()
            .unwrap()
            .is_empty()
    );

    // Case 2: every option given in `WITH (...)` is kept verbatim as a string value.
    let sql = r"
CREATE TABLE vectors (
    ts TIMESTAMP TIME INDEX,
    vec VECTOR(128) VECTOR INDEX WITH (metric='cosine', connectivity='32', expansion_add='256', expansion_search='128')
)";
    let result =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();

    let Statement::CreateTable(c) = &result[0] else {
        panic!("should be create_table statement");
    };
    let vec_col = c
        .columns
        .iter()
        .find(|col| col.name().value == "vec")
        .expect("column `vec` should be present in the parsed statement");
    let options = vec_col.extensions.vector_index_options.as_ref().unwrap();
    assert_eq!(options.len(), 4);
    assert_eq!(options.get("metric").unwrap(), "cosine");
    assert_eq!(options.get("connectivity").unwrap(), "32");
    assert_eq!(options.get("expansion_add").unwrap(), "256");
    assert_eq!(options.get("expansion_search").unwrap(), "128");
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_create_table_vector_index_invalid_type() {
    // Declaring `VECTOR INDEX` on an INT column must be rejected while parsing.
    let sql = r"
CREATE TABLE vectors (
    ts TIMESTAMP TIME INDEX,
    col INT VECTOR INDEX,
)";
    // `unwrap_err` panics when parsing unexpectedly succeeds, so it doubles as the
    // "must be an error" check.
    let message =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap_err()
            .to_string();
    assert!(message.contains("VECTOR INDEX only supports Vector type columns"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_create_table_vector_index_duplicate() {
    // Repeating the `VECTOR INDEX` clause on the same column must be rejected.
    let sql = r"
CREATE TABLE vectors (
    ts TIMESTAMP TIME INDEX,
    vec VECTOR(128) VECTOR INDEX VECTOR INDEX,
)";
    // `unwrap_err` panics when parsing unexpectedly succeeds, so it doubles as the
    // "must be an error" check.
    let message =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap_err()
            .to_string();
    assert!(message.contains("duplicated VECTOR INDEX option"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_create_table_vector_index_invalid_option() {
    // An option key outside the accepted set must be rejected, even next to a valid one.
    let sql = r"
CREATE TABLE vectors (
    ts TIMESTAMP TIME INDEX,
    vec VECTOR(128) VECTOR INDEX WITH (metric='l2sq', invalid_option='foo')
)";
    // `unwrap_err` panics when parsing unexpectedly succeeds, so it doubles as the
    // "must be an error" check.
    let message =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap_err()
            .to_string();
    assert!(message.contains("invalid VECTOR INDEX option"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_column_extensions_vector_index() {
    let dialect = GenericDialect {};

    // Accepted: a vector-typed column followed by `VECTOR INDEX WITH (...)`.
    {
        let tokens = Tokenizer::new(&dialect, "VECTOR INDEX WITH (metric = 'l2sq')")
            .tokenize()
            .unwrap();
        let mut parser = Parser::new(&dialect).with_tokens(tokens);

        let column_name = Ident::new("vec_col");
        let column_type =
            DataType::Custom(vec![Ident::new("VECTOR")].into(), vec!["128".to_string()]);
        // Pre-populate the vector type options (the dimension) before parsing the index clause.
        let dim_options = OptionMap::from([(VECTOR_OPT_DIM.to_string(), "128".to_string())]);
        let mut extensions = ColumnExtensions {
            vector_options: Some(dim_options),
            ..Default::default()
        };

        let outcome = ParserContext::parse_column_extensions(
            &mut parser,
            &column_name,
            &column_type,
            &mut extensions,
        );
        assert!(outcome.is_ok());
        assert!(extensions.vector_index_options.is_some());

        let index_options = extensions.vector_index_options.unwrap();
        assert_eq!(index_options.get("metric"), Some("l2sq"));
    }

    // Rejected: the same clause on a non-vector column.
    {
        let tokens = Tokenizer::new(&dialect, "VECTOR INDEX").tokenize().unwrap();
        let mut parser = Parser::new(&dialect).with_tokens(tokens);

        let column_name = Ident::new("num_col");
        let column_type = DataType::Int(None);
        let mut extensions = ColumnExtensions::default();

        let outcome = ParserContext::parse_column_extensions(
            &mut parser,
            &column_name,
            &column_type,
            &mut extensions,
        );
        assert!(outcome.is_err());

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("VECTOR INDEX only supports Vector type columns"));
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -222,6 +222,29 @@ pub fn validate_column_skipping_index_create_option(key: &str) -> bool {
|
|||||||
.contains(&key)
|
.contains(&key)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `VECTOR INDEX` option key: vector index engine (usearch).
pub const COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE: &str = "engine";
/// `VECTOR INDEX` option key: distance metric (l2sq, cosine, inner_product).
pub const COLUMN_VECTOR_INDEX_OPT_KEY_METRIC: &str = "metric";
/// `VECTOR INDEX` option key: HNSW M parameter.
pub const COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY: &str = "connectivity";
/// `VECTOR INDEX` option key: ef_construction parameter.
pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD: &str = "expansion_add";
/// `VECTOR INDEX` option key: ef_search parameter.
pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH: &str = "expansion_search";

/// Returns `true` when `key` is one of the option keys accepted by `VECTOR INDEX`.
///
/// The comparison is exact: keys are matched byte-for-byte against the
/// `COLUMN_VECTOR_INDEX_OPT_KEY_*` constants, so differently-cased or padded keys are rejected.
pub fn validate_column_vector_index_create_option(key: &str) -> bool {
    matches!(
        key,
        COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE
            | COLUMN_VECTOR_INDEX_OPT_KEY_METRIC
            | COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY
            | COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD
            | COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH
    )
}
|
||||||
|
|
||||||
/// Convert an [`IntervalMonthDayNano`] to a [`Duration`].
|
/// Convert an [`IntervalMonthDayNano`] to a [`Duration`].
|
||||||
#[cfg(feature = "enterprise")]
|
#[cfg(feature = "enterprise")]
|
||||||
pub fn convert_month_day_nano_to_duration(
|
pub fn convert_month_day_nano_to_duration(
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ use crate::ast::{
|
|||||||
use crate::error::{
|
use crate::error::{
|
||||||
self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result,
|
self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result,
|
||||||
SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu,
|
SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu,
|
||||||
SetSkippingIndexOptionSnafu, SqlCommonSnafu,
|
SetSkippingIndexOptionSnafu, SetVectorIndexOptionSnafu, SqlCommonSnafu,
|
||||||
};
|
};
|
||||||
use crate::statements::create::{Column, ColumnExtensions};
|
use crate::statements::create::{Column, ColumnExtensions};
|
||||||
pub use crate::statements::option_map::OptionMap;
|
pub use crate::statements::option_map::OptionMap;
|
||||||
@@ -147,6 +147,12 @@ pub fn column_to_schema(
|
|||||||
.context(SetSkippingIndexOptionSnafu)?;
|
.context(SetSkippingIndexOptionSnafu)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(options) = column.extensions.build_vector_index_options()? {
|
||||||
|
column_schema = column_schema
|
||||||
|
.with_vector_index_options(&options)
|
||||||
|
.context(SetVectorIndexOptionSnafu)?;
|
||||||
|
}
|
||||||
|
|
||||||
column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some());
|
column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some());
|
||||||
|
|
||||||
if matches!(column.data_type(), SqlDataType::JSON) {
|
if matches!(column.data_type(), SqlDataType::JSON) {
|
||||||
@@ -710,6 +716,7 @@ mod tests {
|
|||||||
skipping_index_options: None,
|
skipping_index_options: None,
|
||||||
inverted_index_options: None,
|
inverted_index_options: None,
|
||||||
json_datatype_options: None,
|
json_datatype_options: None,
|
||||||
|
vector_index_options: None,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -720,4 +727,82 @@ mod tests {
|
|||||||
assert_eq!(fulltext_options.analyzer, FulltextAnalyzer::English);
|
assert_eq!(fulltext_options.analyzer, FulltextAnalyzer::English);
|
||||||
assert!(fulltext_options.case_sensitive);
|
assert!(fulltext_options.case_sensitive);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_column_to_schema_with_vector_index() {
    use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};

    // Every tunable except the engine is given explicitly; the engine is left unset.
    let index_options = OptionMap::from([
        ("metric".to_string(), "cosine".to_string()),
        ("connectivity".to_string(), "32".to_string()),
        ("expansion_add".to_string(), "200".to_string()),
        ("expansion_search".to_string(), "100".to_string()),
    ]);
    let column = Column {
        column_def: ColumnDef {
            name: "embedding".into(),
            data_type: SqlDataType::Custom(
                vec![Ident::new(VECTOR_TYPE_NAME)].into(),
                vec!["128".to_string()],
            ),
            options: vec![],
        },
        extensions: ColumnExtensions {
            vector_index_options: Some(index_options),
            ..Default::default()
        },
    };

    let schema = column_to_schema(&column, "ts", None).unwrap();
    assert_eq!("embedding", schema.name);
    assert!(schema.is_vector_indexed());

    let built = schema.vector_index_options().unwrap().unwrap();
    assert_eq!(built.engine, VectorIndexEngineType::Usearch);
    assert_eq!(built.metric, VectorDistanceMetric::Cosine);
    assert_eq!(built.connectivity, 32);
    assert_eq!(built.expansion_add, 200);
    assert_eq!(built.expansion_search, 100);
}
|
||||||
|
|
||||||
|
#[test]
fn test_column_to_schema_with_vector_index_defaults() {
    use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};

    // An empty option map still declares the index; every setting falls back to its default.
    let column = Column {
        column_def: ColumnDef {
            name: "vec".into(),
            data_type: SqlDataType::Custom(
                vec![Ident::new(VECTOR_TYPE_NAME)].into(),
                vec!["64".to_string()],
            ),
            options: vec![],
        },
        extensions: ColumnExtensions {
            vector_index_options: Some(OptionMap::default()),
            ..Default::default()
        },
    };

    let schema = column_to_schema(&column, "ts", None).unwrap();
    assert_eq!("vec", schema.name);
    assert!(schema.is_vector_indexed());

    // Expected default values.
    let built = schema.vector_index_options().unwrap().unwrap();
    assert_eq!(built.engine, VectorIndexEngineType::Usearch);
    assert_eq!(built.metric, VectorDistanceMetric::L2sq);
    assert_eq!(built.connectivity, 16);
    assert_eq!(built.expansion_add, 128);
    assert_eq!(built.expansion_search, 64);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,7 +17,10 @@ use std::fmt::{Display, Formatter};
|
|||||||
|
|
||||||
use common_catalog::consts::FILE_ENGINE;
|
use common_catalog::consts::FILE_ENGINE;
|
||||||
use datatypes::json::JsonStructureSettings;
|
use datatypes::json::JsonStructureSettings;
|
||||||
use datatypes::schema::{FulltextOptions, SkippingIndexOptions};
|
use datatypes::schema::{
|
||||||
|
FulltextOptions, SkippingIndexOptions, VectorDistanceMetric, VectorIndexEngineType,
|
||||||
|
VectorIndexOptions,
|
||||||
|
};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use snafu::ResultExt;
|
use snafu::ResultExt;
|
||||||
@@ -133,6 +136,8 @@ pub struct ColumnExtensions {
|
|||||||
///
|
///
|
||||||
/// Inverted index doesn't have options at present. There won't be any options in that map.
|
/// Inverted index doesn't have options at present. There won't be any options in that map.
|
||||||
pub inverted_index_options: Option<OptionMap>,
|
pub inverted_index_options: Option<OptionMap>,
|
||||||
|
/// Vector index options for HNSW-based vector similarity search.
|
||||||
|
pub vector_index_options: Option<OptionMap>,
|
||||||
pub json_datatype_options: Option<OptionMap>,
|
pub json_datatype_options: Option<OptionMap>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,6 +213,15 @@ impl Display for Column {
|
|||||||
write!(f, " INVERTED INDEX")?;
|
write!(f, " INVERTED INDEX")?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(vector_index_options) = &self.extensions.vector_index_options {
|
||||||
|
if !vector_index_options.is_empty() {
|
||||||
|
let options = vector_index_options.kv_pairs();
|
||||||
|
write!(f, " VECTOR INDEX WITH({})", format_list_comma!(options))?;
|
||||||
|
} else {
|
||||||
|
write!(f, " VECTOR INDEX")?;
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -233,6 +247,89 @@ impl ColumnExtensions {
|
|||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn build_vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
|
||||||
|
let Some(options) = self.vector_index_options.as_ref() else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
let options_map: HashMap<String, String> = options.clone().into_map();
|
||||||
|
let mut result = VectorIndexOptions::default();
|
||||||
|
|
||||||
|
if let Some(s) = options_map.get("engine") {
|
||||||
|
result.engine = s.parse::<VectorIndexEngineType>().map_err(|e| {
|
||||||
|
InvalidSqlSnafu {
|
||||||
|
msg: format!("invalid VECTOR INDEX engine: {e}"),
|
||||||
|
}
|
||||||
|
.build()
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(s) = options_map.get("metric") {
|
||||||
|
result.metric = s.parse::<VectorDistanceMetric>().map_err(|e| {
|
||||||
|
InvalidSqlSnafu {
|
||||||
|
msg: format!("invalid VECTOR INDEX metric: {e}"),
|
||||||
|
}
|
||||||
|
.build()
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(s) = options_map.get("connectivity") {
|
||||||
|
let value = s.parse::<u32>().map_err(|_| {
|
||||||
|
InvalidSqlSnafu {
|
||||||
|
msg: format!(
|
||||||
|
"invalid VECTOR INDEX connectivity: {s}, expected positive integer"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.build()
|
||||||
|
})?;
|
||||||
|
if !(2..=2048).contains(&value) {
|
||||||
|
return InvalidSqlSnafu {
|
||||||
|
msg: "VECTOR INDEX connectivity must be in the range [2, 2048].".to_string(),
|
||||||
|
}
|
||||||
|
.fail();
|
||||||
|
}
|
||||||
|
result.connectivity = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(s) = options_map.get("expansion_add") {
|
||||||
|
let value = s.parse::<u32>().map_err(|_| {
|
||||||
|
InvalidSqlSnafu {
|
||||||
|
msg: format!(
|
||||||
|
"invalid VECTOR INDEX expansion_add: {s}, expected positive integer"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.build()
|
||||||
|
})?;
|
||||||
|
if value == 0 {
|
||||||
|
return InvalidSqlSnafu {
|
||||||
|
msg: "VECTOR INDEX expansion_add must be greater than 0".to_string(),
|
||||||
|
}
|
||||||
|
.fail();
|
||||||
|
}
|
||||||
|
result.expansion_add = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(s) = options_map.get("expansion_search") {
|
||||||
|
let value = s.parse::<u32>().map_err(|_| {
|
||||||
|
InvalidSqlSnafu {
|
||||||
|
msg: format!(
|
||||||
|
"invalid VECTOR INDEX expansion_search: {s}, expected positive integer"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.build()
|
||||||
|
})?;
|
||||||
|
if value == 0 {
|
||||||
|
return InvalidSqlSnafu {
|
||||||
|
msg: "VECTOR INDEX expansion_search must be greater than 0".to_string(),
|
||||||
|
}
|
||||||
|
.fail();
|
||||||
|
}
|
||||||
|
result.expansion_search = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(result))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn build_json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
|
pub fn build_json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
|
||||||
let Some(options) = self.json_datatype_options.as_ref() else {
|
let Some(options) = self.json_datatype_options.as_ref() else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
@@ -893,4 +990,92 @@ AS SELECT number FROM numbers_input where number > 10"#,
|
|||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_vector_index_options_validation() {
    use super::{ColumnExtensions, OptionMap};

    // Builds typed vector index options from a raw option map; all other extensions unset.
    let build = |options: OptionMap| {
        ColumnExtensions {
            vector_index_options: Some(options),
            ..Default::default()
        }
        .build_vector_index_options()
    };
    // Convenience constructor for an option map holding exactly one key/value pair.
    let single =
        |key: &str, value: &str| OptionMap::from([(key.to_string(), value.to_string())]);

    // Each entry is (option key, raw value, fragment expected in the error message).
    let rejected = [
        // Zero and both sides of the allowed [2, 2048] connectivity range.
        ("connectivity", "0", "connectivity must be in the range [2, 2048]"),
        ("connectivity", "1", "connectivity must be in the range [2, 2048]"),
        ("connectivity", "2049", "connectivity must be in the range [2, 2048]"),
        // Non-numeric input is reported as a parse failure rather than a range failure.
        ("connectivity", "abc", "invalid VECTOR INDEX connectivity"),
        ("expansion_add", "0", "expansion_add must be greater than 0"),
        ("expansion_search", "0", "expansion_search must be greater than 0"),
    ];
    for (key, value, expected) in rejected {
        let result = build(single(key, value));
        assert!(result.is_err(), "{key}={value} should be rejected");
        let message = result.unwrap_err().to_string();
        assert!(
            message.contains(expected),
            "{key}={value}: unexpected error message: {message}"
        );
    }

    // The inclusive bounds of the connectivity range are accepted.
    for (raw, expected) in [("2", 2), ("2048", 2048)] {
        let options = build(single("connectivity", raw)).unwrap().unwrap();
        assert_eq!(options.connectivity, expected);
    }

    // Valid values should succeed and be carried over unchanged.
    let result = build(OptionMap::from([
        ("connectivity".to_string(), "32".to_string()),
        ("expansion_add".to_string(), "200".to_string()),
        ("expansion_search".to_string(), "100".to_string()),
    ]));
    assert!(result.is_ok());
    let options = result.unwrap().unwrap();
    assert_eq!(options.connectivity, 32);
    assert_eq!(options.expansion_add, 200);
    assert_eq!(options.expansion_search, 100);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,5 +27,8 @@ pub use datatypes::schema::{
|
|||||||
|
|
||||||
pub use self::descriptors::*;
|
pub use self::descriptors::*;
|
||||||
pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, ParseIdError};
|
pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, ParseIdError};
|
||||||
pub use self::requests::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
|
pub use self::requests::{
|
||||||
|
ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector, VectorDistanceMetric,
|
||||||
|
VectorIndexEngine, VectorIndexEngineType, VectorSearchMatches, VectorSearchRequest,
|
||||||
|
};
|
||||||
pub use self::types::{SequenceNumber, SequenceRange};
|
pub use self::types::{SequenceNumber, SequenceRange};
|
||||||
|
|||||||
@@ -14,11 +14,66 @@
|
|||||||
|
|
||||||
use std::fmt::{Display, Formatter};
|
use std::fmt::{Display, Formatter};
|
||||||
|
|
||||||
|
use common_error::ext::BoxedError;
|
||||||
use common_recordbatch::OrderOption;
|
use common_recordbatch::OrderOption;
|
||||||
use datafusion_expr::expr::Expr;
|
use datafusion_expr::expr::Expr;
|
||||||
|
// Re-export vector types from datatypes to avoid duplication
|
||||||
|
pub use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
|
||||||
use strum::Display;
|
use strum::Display;
|
||||||
|
|
||||||
use crate::storage::SequenceNumber;
|
use crate::storage::{ColumnId, SequenceNumber};
|
||||||
|
|
||||||
|
/// A hint for KNN vector search.
///
/// Carried in a scan request to ask the scan to use the vector index of one column.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorSearchRequest {
    /// Column ID of the vector column to search.
    pub column_id: ColumnId,
    /// The query vector to search for.
    // NOTE(review): presumably its length must equal the column's vector dimension — confirm.
    pub query_vector: Vec<f32>,
    /// Number of nearest neighbors to return.
    pub k: usize,
    /// Distance metric to use (matches the index metric).
    pub metric: VectorDistanceMetric,
}
|
||||||
|
|
||||||
|
/// Search results from vector index.
// NOTE(review): `keys` and `distances` look like parallel arrays (entry `i` of each describes
// the same match) — confirm against the engine implementation.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorSearchMatches {
    /// Keys (row offsets in the index).
    pub keys: Vec<u64>,
    /// Distances from the query vector.
    pub distances: Vec<f32>,
}
|
||||||
|
|
||||||
|
/// Trait for vector index engines (HNSW implementations).
///
/// This trait defines the interface for pluggable vector index engines.
/// Implementations (e.g., UsearchEngine) are provided by storage engines like mito2.
///
/// Implementors must be `Send + Sync`. The mutating operations `add` and `reserve` take
/// `&mut self`; all other methods take `&self`. Fallible methods report failures as
/// [`BoxedError`].
pub trait VectorIndexEngine: Send + Sync {
    /// Adds a vector with the given key.
    // NOTE(review): presumably `vector.len()` must match the index dimension — confirm.
    fn add(&mut self, key: u64, vector: &[f32]) -> Result<(), BoxedError>;

    /// Searches for k nearest neighbors.
    ///
    /// Returns the matching keys together with their distances from `query`.
    fn search(&self, query: &[f32], k: usize) -> Result<VectorSearchMatches, BoxedError>;

    /// Returns the serialized length.
    // NOTE(review): presumably the number of bytes `save_to_buffer` needs — confirm.
    fn serialized_length(&self) -> usize;

    /// Serializes the index to a buffer.
    // NOTE(review): presumably `buffer` must hold at least `serialized_length()` bytes — confirm.
    fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<(), BoxedError>;

    /// Reserves capacity for vectors.
    fn reserve(&mut self, capacity: usize) -> Result<(), BoxedError>;

    /// Returns current size (number of vectors).
    fn size(&self) -> usize;

    /// Returns current capacity.
    fn capacity(&self) -> usize;

    /// Returns memory usage in bytes.
    fn memory_usage(&self) -> usize;
}
|
||||||
|
|
||||||
/// A hint on how to select rows from a time-series.
|
/// A hint on how to select rows from a time-series.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)]
|
||||||
@@ -38,7 +93,7 @@ pub enum TimeSeriesDistribution {
|
|||||||
PerSeries,
|
PerSeries,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Clone, Debug, PartialEq, Eq)]
|
#[derive(Default, Clone, Debug, PartialEq)]
|
||||||
pub struct ScanRequest {
|
pub struct ScanRequest {
|
||||||
/// Indices of columns to read, `None` to read all columns. These indices are
|
/// Indices of columns to read, `None` to read all columns. These indices are
|
||||||
/// based on table schema.
|
/// based on table schema.
|
||||||
@@ -66,6 +121,9 @@ pub struct ScanRequest {
|
|||||||
pub sst_min_sequence: Option<SequenceNumber>,
|
pub sst_min_sequence: Option<SequenceNumber>,
|
||||||
/// Optional hint for the distribution of time-series data.
|
/// Optional hint for the distribution of time-series data.
|
||||||
pub distribution: Option<TimeSeriesDistribution>,
|
pub distribution: Option<TimeSeriesDistribution>,
|
||||||
|
/// Optional hint for KNN vector search. When set, the scan should use
|
||||||
|
/// vector index to find the k nearest neighbors.
|
||||||
|
pub vector_search: Option<VectorSearchRequest>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Display for ScanRequest {
|
impl Display for ScanRequest {
|
||||||
@@ -138,6 +196,16 @@ impl Display for ScanRequest {
|
|||||||
if let Some(distribution) = &self.distribution {
|
if let Some(distribution) = &self.distribution {
|
||||||
write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?;
|
write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?;
|
||||||
}
|
}
|
||||||
|
if let Some(vector_search) = &self.vector_search {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{}vector_search: column_id={}, k={}, metric={}",
|
||||||
|
delimiter.as_str(),
|
||||||
|
vector_search.column_id,
|
||||||
|
vector_search.k,
|
||||||
|
vector_search.metric
|
||||||
|
)?;
|
||||||
|
}
|
||||||
write!(f, " }}")
|
write!(f, " }}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user