From c08f3a4472b9b5bd728f5b73f7a03b14da4ebda7 Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Wed, 4 Feb 2026 11:54:47 +0800 Subject: [PATCH] test: adds sqlness test for vector index (#7634) * test: adds sqlness test for vector index Signed-off-by: Dennis Zhuang * fix: CI Signed-off-by: Dennis Zhuang * test: redacted flat map and size Signed-off-by: Dennis Zhuang * test: simplify the replace rules Signed-off-by: Dennis Zhuang * chore: update comments and tests Signed-off-by: Dennis Zhuang --------- Signed-off-by: Dennis Zhuang --- .github/workflows/develop.yml | 2 +- src/mito2/src/read/scan_region.rs | 4 + src/query/src/optimizer/scan_hint.rs | 55 +- .../src/optimizer/scan_hint/vector_search.rs | 86 ++- .../function/vector/vector_index.result | 602 ++++++++++++++++++ .../common/function/vector/vector_index.sql | 341 ++++++++++ .../vector/vector_index_explain.result | 270 ++++++++ .../function/vector/vector_index_explain.sql | 155 +++++ .../function/vector/vector_index_join.result | 240 +++++++ .../function/vector/vector_index_join.sql | 159 +++++ tests/runner/src/env/bare.rs | 2 +- 11 files changed, 1901 insertions(+), 15 deletions(-) create mode 100644 tests/cases/standalone/common/function/vector/vector_index.result create mode 100644 tests/cases/standalone/common/function/vector/vector_index.sql create mode 100644 tests/cases/standalone/common/function/vector/vector_index_explain.result create mode 100644 tests/cases/standalone/common/function/vector/vector_index_explain.sql create mode 100644 tests/cases/standalone/common/function/vector/vector_index_join.result create mode 100644 tests/cases/standalone/common/function/vector/vector_index_join.sql diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 1b35761318..8b39acd99b 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -119,7 +119,7 @@ jobs: - name: Build greptime binaries shell: bash # `cargo gc` will invoke `cargo build` with specified args - 
run: cargo gc -- --bin greptime --bin sqlness-runner --features "pg_kvbackend,mysql_kvbackend" + run: cargo gc -- --bin greptime --bin sqlness-runner --features "pg_kvbackend,mysql_kvbackend,vector_index" - name: Pack greptime binaries shell: bash run: | diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 86d73991b0..bc8bb987d7 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -1576,6 +1576,10 @@ impl StreamContext { let exprs: Vec<_> = predicate.exprs().iter().map(|e| e.to_string()).collect(); write!(f, ", \"filters\": {:?}", exprs)?; } + #[cfg(feature = "vector_index")] + if let Some(vector_index_k) = self.input.vector_index_k { + write!(f, ", \"vector_index_k\": {}", vector_index_k)?; + } if !self.input.files.is_empty() { write!(f, ", \"files\": ")?; f.debug_list() diff --git a/src/query/src/optimizer/scan_hint.rs b/src/query/src/optimizer/scan_hint.rs index c06c6b7812..da70813404 100644 --- a/src/query/src/optimizer/scan_hint.rs +++ b/src/query/src/optimizer/scan_hint.rs @@ -323,14 +323,18 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor { } // Avoid carrying vector hints across branching inputs (join/subquery) to prevent - // pruning results before global ordering is applied. - let is_branching = matches!(node, LogicalPlan::Subquery(_)) || node.inputs().len() > 1; - if is_branching && self.ts_row_selector.is_some() { + // pruning results before global ordering is applied. Only treat a subquery as a + // barrier when it contains non-inlineable operators. 
+ let is_branching_for_ts = matches!( + node, + LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_) + ) || node.inputs().len() > 1; + if is_branching_for_ts && self.ts_row_selector.is_some() { // clean previous time series selector hint when encounter subqueries or join self.ts_row_selector = None; } #[cfg(feature = "vector_index")] - if is_branching { + if is_branching_for_vector(node) { self.vector_search.on_branching_enter(); } @@ -371,8 +375,10 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor { LogicalPlan::Filter(_) => { self.vector_search.on_filter_exit(); } - LogicalPlan::Subquery(_) => { - self.vector_search.on_branching_exit(); + LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_) => { + if is_branching_for_vector(_node) { + self.vector_search.on_branching_exit(); + } } _ if _node.inputs().len() > 1 => { self.vector_search.on_branching_exit(); @@ -398,6 +404,43 @@ impl ScanHintVisitor { } } +#[cfg(feature = "vector_index")] +fn is_branching_for_vector(node: &LogicalPlan) -> bool { + if node.inputs().len() > 1 { + return true; + } + + match node { + LogicalPlan::Subquery(subquery) => has_non_inlineable_ops(subquery.subquery.as_ref()), + LogicalPlan::SubqueryAlias(alias) => has_non_inlineable_ops(alias.input.as_ref()), + _ => false, + } +} + +#[cfg(feature = "vector_index")] +fn has_non_inlineable_ops(plan: &LogicalPlan) -> bool { + if matches!( + plan, + LogicalPlan::Limit(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Distinct(_) + | LogicalPlan::Aggregate(_) + | LogicalPlan::Window(_) + | LogicalPlan::Union(_) + | LogicalPlan::Join(_) + ) { + return true; + } + + for input in plan.inputs() { + if has_non_inlineable_ops(input) { + return true; + } + } + + false +} + #[cfg(test)] mod test { use std::sync::Arc; diff --git a/src/query/src/optimizer/scan_hint/vector_search.rs b/src/query/src/optimizer/scan_hint/vector_search.rs index 44f76066a1..7f97ad5ac1 100644 --- a/src/query/src/optimizer/scan_hint/vector_search.rs +++ 
b/src/query/src/optimizer/scan_hint/vector_search.rs @@ -35,6 +35,16 @@ use crate::dummy_catalog::DummyTableProvider; /// - A LIMIT (or Sort.fetch) is present to derive k. /// - The hint stays within a single input chain (not across join/subquery branches). /// - The target column is non-nullable, or an explicit IS NOT NULL filter exists. +/// +/// Known limitations: +/// - Dynamic overfetching is not implemented yet. When filters exist or ORDER BY includes +/// additional tie-breaker columns (e.g., ORDER BY distance, id), the current fixed k may +/// return incorrect results. A future improvement should dynamically adjust k based on +/// filter selectivity and secondary sort requirements. +/// - Hints are only blocked at subquery boundaries when the subquery contains non-inlineable +/// operators (Limit/Sort/Distinct/Aggregate/Window/Union/Join). Simple subqueries without these +/// operators allow hints to propagate through. In distributed mode, the dist analyzer +/// may inline subqueries before this rule runs, further reducing isolation. 
#[derive(Default)] pub(crate) struct VectorSearchState { current_distance: Option, @@ -241,23 +251,47 @@ impl VectorSearchState { fn extract_distance_from_sort( sort: &datafusion_expr::logical_plan::Sort, ) -> Option { - if sort.expr.len() != 1 { - debug!( - "Skip vector hint: Sort has {} expressions, expected 1", - sort.expr.len() - ); + if sort.expr.is_empty() { + debug!("Skip vector hint: Sort has no expressions"); return None; } let sort_expr: &SortExpr = &sort.expr[0]; let info = Self::extract_distance_info(&sort_expr.expr)?; let expected_asc = info.metric != VectorDistanceMetric::InnerProduct; - if sort_expr.asc == expected_asc { + if sort_expr.asc != expected_asc { + return None; + } + + if Self::tie_breakers_allowed(&sort.expr[1..], &info) { Some(info) } else { + if sort.expr.len() > 1 { + debug!( + "Skip vector hint: Sort has unsupported tie-breakers ({} expressions)", + sort.expr.len() + ); + } None } } + fn tie_breakers_allowed(sort_exprs: &[SortExpr], distance_info: &VectorDistanceInfo) -> bool { + if sort_exprs.is_empty() { + return true; + } + + sort_exprs.iter().all(|sort_expr| { + let Expr::Column(col) = &sort_expr.expr else { + return false; + }; + + match &distance_info.table_reference { + Some(table) => col.relation.as_ref() == Some(table), + None => col.relation.is_none(), + } + }) + } + fn extract_limit_info(limit: &datafusion_expr::logical_plan::Limit) -> Option { let fetch = match limit.get_fetch_type().ok()? { FetchType::Literal(fetch) => fetch?, @@ -767,8 +801,10 @@ mod tests { assert!(t2_provider.get_vector_search_hint().is_none()); } + // Simple subqueries (without non-inlineable ops like Limit/Sort/Distinct/Aggregate/Window) + // allow hints to propagate through. See known limitations in VectorSearchState docs. 
#[test] - fn test_no_vector_hint_above_subquery() { + fn test_simple_subquery_allows_hint_propagation() { let provider = build_dummy_provider(10); let table_source = Arc::new(DefaultTableSource::new(provider.clone())); let scan_plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![]) @@ -794,6 +830,42 @@ mod tests { let context = OptimizerContext::default(); let _ = ScanHintRule.rewrite(plan, &context).unwrap(); + // Hint propagates through simple subquery + let hint = provider.get_vector_search_hint().unwrap(); + assert_eq!(hint.k, 5); + } + + // Subqueries with non-inlineable ops (Limit/Sort/Distinct/Aggregate/Window) block hint propagation. + #[test] + fn test_subquery_with_limit_blocks_hint() { + let provider = build_dummy_provider(10); + let table_source = Arc::new(DefaultTableSource::new(provider.clone())); + let scan_plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![]) + .unwrap() + .limit(0, Some(100)) // non-inlineable op inside subquery + .unwrap() + .build() + .unwrap(); + + let subquery = LogicalPlan::Subquery(Subquery { + subquery: Arc::new(scan_plan), + outer_ref_columns: vec![], + spans: Default::default(), + }); + + let expr = vec_distance_expr(VEC_L2SQ_DISTANCE); + let plan = LogicalPlanBuilder::from(subquery) + .sort(vec![expr.sort(true, false)]) + .unwrap() + .limit(0, Some(5)) + .unwrap() + .build() + .unwrap(); + + let context = OptimizerContext::default(); + let _ = ScanHintRule.rewrite(plan, &context).unwrap(); + + // Hint does NOT propagate through subquery with non-inlineable ops assert!(provider.get_vector_search_hint().is_none()); } diff --git a/tests/cases/standalone/common/function/vector/vector_index.result b/tests/cases/standalone/common/function/vector/vector_index.result new file mode 100644 index 0000000000..ace2205f23 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index.result @@ -0,0 +1,602 @@ +-- Test vector index creation and KNN search +-- 
============================================ +-- Part 1: Basic L2 squared distance tests +-- ============================================ +-- Create a table with vector column and L2sq vector index +CREATE TABLE vectors_l2sq ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert test vectors +INSERT INTO vectors_l2sq VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.0, 0.0, 1.0]'), + (5, '2024-01-01 00:00:04', '[1.0, 1.0, 0.0, 0.0]'), + (6, '2024-01-01 00:00:05', '[0.0, 1.0, 1.0, 0.0]'), + (7, '2024-01-01 00:00:06', '[0.0, 0.0, 1.0, 1.0]'), + (8, '2024-01-01 00:00:07', '[1.0, 0.0, 0.0, 1.0]'); + +Affected Rows: 8 + +-- Query BEFORE flush (memtable search) +-- Expected: vec_id=1 (distance=0), vec_id=5 (distance=1), vec_id=8 (distance=1) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+----------+ +| vec_id | distance | ++--------+----------+ +| 1 | 0.0 | +| 5 | 1.0 | +| 8 | 1.0 | ++--------+----------+ + +-- Flush to create SST files with vector index +ADMIN FLUSH_TABLE('vectors_l2sq'); + ++-----------------------------------+ +| ADMIN FLUSH_TABLE('vectors_l2sq') | ++-----------------------------------+ +| 0 | ++-----------------------------------+ + +-- Query AFTER flush (SST index search) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+----------+ +| vec_id | distance | ++--------+----------+ +| 1 | 0.0 | +| 5 | 1.0 | +| 8 | 1.0 | ++--------+----------+ + +-- Query with different target vector +-- Expected: 
vec_id=6 (distance=0), vec_id=2 (distance=1), vec_id=3 (distance=1) +SELECT vec_id, vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]'), vec_id +LIMIT 3; + ++--------+----------+ +| vec_id | distance | ++--------+----------+ +| 6 | 0.0 | +| 2 | 1.0 | +| 3 | 1.0 | ++--------+----------+ + +DROP TABLE vectors_l2sq; + +Affected Rows: 0 + +-- ============================================ +-- Part 2: Cosine distance tests +-- ============================================ +CREATE TABLE vectors_cosine ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'cosine'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert vectors with different magnitudes but same/different directions +INSERT INTO vectors_cosine VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]'); + +Affected Rows: 5 + +-- Memtable search with cosine distance +-- vec_id=1 and vec_id=2 should have same cosine distance (0) since they point same direction +SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_cosine +ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+------------+ +| vec_id | distance | ++--------+------------+ +| 1 | 0.0 | +| 2 | 0.0 | +| 4 | 0.29289323 | ++--------+------------+ + +ADMIN FLUSH_TABLE('vectors_cosine'); + ++-------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_cosine') | ++-------------------------------------+ +| 0 | ++-------------------------------------+ + +-- SST index search with cosine distance +SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_cosine +ORDER BY 
vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+------------+ +| vec_id | distance | ++--------+------------+ +| 1 | 0.0 | +| 2 | 0.0 | +| 4 | 0.29289323 | ++--------+------------+ + +DROP TABLE vectors_cosine; + +Affected Rows: 0 + +-- ============================================ +-- Part 3: Dot product (inner product) tests +-- ============================================ +CREATE TABLE vectors_dot ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'dot'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_dot VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]'); + +Affected Rows: 5 + +-- Memtable search with dot product +-- Larger dot product means more similar, so we use negative for ordering +-- vec_id=2 should be best (dot=2), vec_id=1 and vec_id=4 have dot=1 +SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product +FROM vectors_dot +ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id +LIMIT 3; + ++--------+-------------+ +| vec_id | dot_product | ++--------+-------------+ +| 2 | 2.0 | +| 1 | 1.0 | +| 4 | 1.0 | ++--------+-------------+ + +ADMIN FLUSH_TABLE('vectors_dot'); + ++----------------------------------+ +| ADMIN FLUSH_TABLE('vectors_dot') | ++----------------------------------+ +| 0 | ++----------------------------------+ + +-- SST index search with dot product +SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product +FROM vectors_dot +ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id +LIMIT 3; + ++--------+-------------+ +| vec_id | dot_product | ++--------+-------------+ +| 2 | 2.0 | +| 1 | 1.0 | +| 4 | 1.0 | ++--------+-------------+ 
+ +DROP TABLE vectors_dot; + +Affected Rows: 0 + +-- ============================================ +-- Part 4: NULL vector handling tests +-- ============================================ +CREATE TABLE vectors_null ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert vectors with some NULLs +INSERT INTO vectors_null VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', NULL), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', NULL), + (5, '2024-01-01 00:00:04', '[0.0, 0.0, 1.0, 0.0]'); + +Affected Rows: 5 + +-- Memtable search should skip NULL vectors +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_null +WHERE embedding IS NOT NULL +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+----------+ +| vec_id | distance | ++--------+----------+ +| 1 | 0.0 | +| 3 | 2.0 | +| 5 | 2.0 | ++--------+----------+ + +ADMIN FLUSH_TABLE('vectors_null'); + ++-----------------------------------+ +| ADMIN FLUSH_TABLE('vectors_null') | ++-----------------------------------+ +| 0 | ++-----------------------------------+ + +-- SST index search should also skip NULL vectors +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_null +WHERE embedding IS NOT NULL +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+----------+ +| vec_id | distance | ++--------+----------+ +| 1 | 0.0 | +| 3 | 2.0 | +| 5 | 2.0 | ++--------+----------+ + +DROP TABLE vectors_null; + +Affected Rows: 0 + +-- ============================================ +-- Part 5: Mixed memtable + SST search tests +-- ============================================ +CREATE TABLE vectors_mixed ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 
'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert first batch and flush to SST +INSERT INTO vectors_mixed VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'); + +Affected Rows: 3 + +ADMIN FLUSH_TABLE('vectors_mixed'); + ++------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_mixed') | ++------------------------------------+ +| 0 | ++------------------------------------+ + +-- Insert second batch (stays in memtable) +INSERT INTO vectors_mixed VALUES + (4, '2024-01-01 00:00:03', '[0.5, 0.5, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[0.9, 0.1, 0.0, 0.0]'); + +Affected Rows: 2 + +-- Query should search both SST (vec_id 1,2,3) and memtable (vec_id 4,5) +-- Target: [1.0, 0.0, 0.0, 0.0] +-- Expected: vec_id=1 (dist=0), vec_id=5 (dist=0.02), vec_id=4 (dist=0.5) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_mixed +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + ++--------+-------------+ +| vec_id | distance | ++--------+-------------+ +| 1 | 0.0 | +| 5 | 0.020000005 | +| 4 | 0.5 | ++--------+-------------+ + +DROP TABLE vectors_mixed; + +Affected Rows: 0 + +-- ============================================ +-- Part 6: KNN with WHERE clause tests +-- ============================================ +CREATE TABLE vectors_filter ( + vec_id INT, + category STRING, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_filter VALUES + (1, 'A', '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, 'B', '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'), + (3, 'A', '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, 'B', '2024-01-01 00:00:03', '[0.1, 0.9, 0.0, 0.0]'), + (5, 'A', '2024-01-01 00:00:04', '[0.5, 0.5, 0.0, 0.0]'); + +Affected Rows: 5 + +-- 
Memtable search with filter +SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE category = 'A' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+----------+----------+ +| vec_id | category | distance | ++--------+----------+----------+ +| 1 | A | 0.0 | +| 5 | A | 0.5 | ++--------+----------+----------+ + +ADMIN FLUSH_TABLE('vectors_filter'); + ++-------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_filter') | ++-------------------------------------+ +| 0 | ++-------------------------------------+ + +-- SST index search with filter +SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE category = 'A' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+----------+----------+ +| vec_id | category | distance | ++--------+----------+----------+ +| 1 | A | 0.0 | +| 5 | A | 0.5 | ++--------+----------+----------+ + +-- Filter with time range +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE ts >= '2024-01-01 00:00:02' AND ts <= '2024-01-01 00:00:04' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+-----------+ +| vec_id | distance | ++--------+-----------+ +| 5 | 0.5 | +| 4 | 1.6199999 | ++--------+-----------+ + +DROP TABLE vectors_filter; + +Affected Rows: 0 + +-- ============================================ +-- Part 7: Higher dimension vectors +-- ============================================ +CREATE TABLE vectors_high_dim ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(128) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert high-dimensional vectors (simplified: first few elements differ) +INSERT INTO vectors_high_dim VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'); + +Affected Rows: 3 + 
+-- Memtable search +SELECT vec_id FROM vectors_high_dim +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 3 | ++--------+ + +ADMIN FLUSH_TABLE('vectors_high_dim'); + ++---------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_high_dim') | ++---------------------------------------+ +| 0 | ++---------------------------------------+ + +-- SST index search +SELECT vec_id FROM vectors_high_dim +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 3 | ++--------+ + +DROP TABLE vectors_high_dim; + +Affected Rows: 0 + +-- ============================================ +-- Part 8: Different k 
values (LIMIT) +-- ============================================ +CREATE TABLE vectors_k ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_k VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.8, 0.2, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[0.7, 0.3, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[0.6, 0.4, 0.0, 0.0]'), + (6, '2024-01-01 00:00:05', '[0.5, 0.5, 0.0, 0.0]'), + (7, '2024-01-01 00:00:06', '[0.4, 0.6, 0.0, 0.0]'), + (8, '2024-01-01 00:00:07', '[0.3, 0.7, 0.0, 0.0]'), + (9, '2024-01-01 00:00:08', '[0.2, 0.8, 0.0, 0.0]'), + (10, '2024-01-01 00:00:09', '[0.1, 0.9, 0.0, 0.0]'); + +Affected Rows: 10 + +ADMIN FLUSH_TABLE('vectors_k'); + ++--------------------------------+ +| ADMIN FLUSH_TABLE('vectors_k') | ++--------------------------------+ +| 0 | ++--------------------------------+ + +-- k=1 +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 1; + ++--------+ +| vec_id | ++--------+ +| 1 | ++--------+ + +-- k=5 +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 5; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | ++--------+ + +-- k=10 (all vectors) +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 10; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | +| 6 | +| 7 | +| 8 | +| 9 | +| 10 | ++--------+ + +DROP TABLE vectors_k; + +Affected Rows: 0 + +-- ============================================ +-- Part 9: Engine parameter tests +-- ============================================ +-- Create table with explicit engine parameter (usearch is default) +CREATE TABLE vectors_engine ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + 
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (engine = 'usearch', metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +-- Insert test vectors +INSERT INTO vectors_engine VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0]'); + +Affected Rows: 3 + +-- Memtable search +SELECT vec_id FROM vectors_engine +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 3 | ++--------+ + +ADMIN FLUSH_TABLE('vectors_engine'); + ++-------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_engine') | ++-------------------------------------+ +| 0 | ++-------------------------------------+ + +-- SST index search with usearch engine +SELECT vec_id FROM vectors_engine +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + ++--------+ +| vec_id | ++--------+ +| 1 | +| 3 | ++--------+ + +DROP TABLE vectors_engine; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/vector/vector_index.sql b/tests/cases/standalone/common/function/vector/vector_index.sql new file mode 100644 index 0000000000..ecfcd586b4 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index.sql @@ -0,0 +1,341 @@ +-- Test vector index creation and KNN search + +-- ============================================ +-- Part 1: Basic L2 squared distance tests +-- ============================================ + +-- Create a table with vector column and L2sq vector index +CREATE TABLE vectors_l2sq ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +-- Insert test vectors +INSERT INTO vectors_l2sq VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'), + 
(4, '2024-01-01 00:00:03', '[0.0, 0.0, 0.0, 1.0]'), + (5, '2024-01-01 00:00:04', '[1.0, 1.0, 0.0, 0.0]'), + (6, '2024-01-01 00:00:05', '[0.0, 1.0, 1.0, 0.0]'), + (7, '2024-01-01 00:00:06', '[0.0, 0.0, 1.0, 1.0]'), + (8, '2024-01-01 00:00:07', '[1.0, 0.0, 0.0, 1.0]'); + +-- Query BEFORE flush (memtable search) +-- Expected: vec_id=1 (distance=0), vec_id=5 (distance=1), vec_id=8 (distance=1) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +-- Flush to create SST files with vector index +ADMIN FLUSH_TABLE('vectors_l2sq'); + +-- Query AFTER flush (SST index search) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +-- Query with different target vector +-- Expected: vec_id=6 (distance=0), vec_id=2 (distance=1), vec_id=3 (distance=1) +SELECT vec_id, vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]') as distance +FROM vectors_l2sq +ORDER BY vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]'), vec_id +LIMIT 3; + +DROP TABLE vectors_l2sq; + +-- ============================================ +-- Part 2: Cosine distance tests +-- ============================================ + +CREATE TABLE vectors_cosine ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'cosine'), + PRIMARY KEY (vec_id) +); + +-- Insert vectors with different magnitudes but same/different directions +INSERT INTO vectors_cosine VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]'); + +-- Memtable search with cosine distance +-- vec_id=1 and vec_id=2 should have same cosine 
distance (0) since they point same direction +SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_cosine +ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +ADMIN FLUSH_TABLE('vectors_cosine'); + +-- SST index search with cosine distance +SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_cosine +ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +DROP TABLE vectors_cosine; + +-- ============================================ +-- Part 3: Dot product (inner product) tests +-- ============================================ + +CREATE TABLE vectors_dot ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'dot'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_dot VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]'); + +-- Memtable search with dot product +-- Larger dot product means more similar, so we use negative for ordering +-- vec_id=2 should be best (dot=2), vec_id=1 and vec_id=4 have dot=1 +SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product +FROM vectors_dot +ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id +LIMIT 3; + +ADMIN FLUSH_TABLE('vectors_dot'); + +-- SST index search with dot product +SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product +FROM vectors_dot +ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id +LIMIT 3; + +DROP TABLE vectors_dot; + +-- ============================================ +-- Part 4: NULL vector handling tests +-- ============================================ + +CREATE TABLE vectors_null ( + vec_id INT, + ts TIMESTAMP TIME 
INDEX, + embedding VECTOR(4) VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +-- Insert vectors with some NULLs +INSERT INTO vectors_null VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', NULL), + (3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', NULL), + (5, '2024-01-01 00:00:04', '[0.0, 0.0, 1.0, 0.0]'); + +-- Memtable search should skip NULL vectors +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_null +WHERE embedding IS NOT NULL +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +ADMIN FLUSH_TABLE('vectors_null'); + +-- SST index search should also skip NULL vectors +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_null +WHERE embedding IS NOT NULL +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +DROP TABLE vectors_null; + +-- ============================================ +-- Part 5: Mixed memtable + SST search tests +-- ============================================ + +CREATE TABLE vectors_mixed ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +-- Insert first batch and flush to SST +INSERT INTO vectors_mixed VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'); + +ADMIN FLUSH_TABLE('vectors_mixed'); + +-- Insert second batch (stays in memtable) +INSERT INTO vectors_mixed VALUES + (4, '2024-01-01 00:00:03', '[0.5, 0.5, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[0.9, 0.1, 0.0, 0.0]'); + +-- Query should search both SST (vec_id 1,2,3) and memtable (vec_id 4,5) +-- Target: [1.0, 0.0, 0.0, 0.0] +-- Expected: vec_id=1 (dist=0), vec_id=5 (dist=0.02), vec_id=4 (dist=0.5) +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 
0.0, 0.0, 0.0]') as distance +FROM vectors_mixed +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 3; + +DROP TABLE vectors_mixed; + +-- ============================================ +-- Part 6: KNN with WHERE clause tests +-- ============================================ + +CREATE TABLE vectors_filter ( + vec_id INT, + category STRING, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_filter VALUES + (1, 'A', '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, 'B', '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'), + (3, 'A', '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'), + (4, 'B', '2024-01-01 00:00:03', '[0.1, 0.9, 0.0, 0.0]'), + (5, 'A', '2024-01-01 00:00:04', '[0.5, 0.5, 0.0, 0.0]'); + +-- Memtable search with filter +SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE category = 'A' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +ADMIN FLUSH_TABLE('vectors_filter'); + +-- SST index search with filter +SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE category = 'A' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +-- Filter with time range +SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance +FROM vectors_filter +WHERE ts >= '2024-01-01 00:00:02' AND ts <= '2024-01-01 00:00:04' +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +DROP TABLE vectors_filter; + +-- ============================================ +-- Part 7: Higher dimension vectors +-- ============================================ + +CREATE TABLE vectors_high_dim ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(128) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +-- Insert 
high-dimensional vectors (simplified: first few elements differ) +INSERT INTO vectors_high_dim VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'); + +-- Memtable search +SELECT vec_id FROM vectors_high_dim +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +ADMIN FLUSH_TABLE('vectors_high_dim'); + +-- SST index search +SELECT vec_id FROM vectors_high_dim +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +DROP TABLE vectors_high_dim; + +-- ============================================ +-- Part 8: Different k values (LIMIT) +-- ============================================ + +CREATE TABLE vectors_k ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), 
+ PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_k VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.8, 0.2, 0.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[0.7, 0.3, 0.0, 0.0]'), + (5, '2024-01-01 00:00:04', '[0.6, 0.4, 0.0, 0.0]'), + (6, '2024-01-01 00:00:05', '[0.5, 0.5, 0.0, 0.0]'), + (7, '2024-01-01 00:00:06', '[0.4, 0.6, 0.0, 0.0]'), + (8, '2024-01-01 00:00:07', '[0.3, 0.7, 0.0, 0.0]'), + (9, '2024-01-01 00:00:08', '[0.2, 0.8, 0.0, 0.0]'), + (10, '2024-01-01 00:00:09', '[0.1, 0.9, 0.0, 0.0]'); + +ADMIN FLUSH_TABLE('vectors_k'); + +-- k=1 +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 1; + +-- k=5 +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 5; + +-- k=10 (all vectors) +SELECT vec_id FROM vectors_k +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 10; + +DROP TABLE vectors_k; + +-- ============================================ +-- Part 9: Engine parameter tests +-- ============================================ + +-- Create table with explicit engine parameter (usearch is default) +CREATE TABLE vectors_engine ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (engine = 'usearch', metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +-- Insert test vectors +INSERT INTO vectors_engine VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0]'); + +-- Memtable search +SELECT vec_id FROM vectors_engine +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +ADMIN FLUSH_TABLE('vectors_engine'); + +-- SST index search with usearch engine +SELECT vec_id FROM vectors_engine +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id +LIMIT 2; + +DROP 
TABLE vectors_engine; diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.result b/tests/cases/standalone/common/function/vector/vector_index_explain.result new file mode 100644 index 0000000000..111267ae74 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.result @@ -0,0 +1,270 @@ +-- Vector index explain analyze coverage +-- ============================================ +-- Part 1: Single table KNN explain +-- ============================================ +CREATE TABLE vectors_explain ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_explain VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +ADMIN FLUSH_TABLE('vectors_explain'); + ++--------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_explain') | ++--------------------------------------+ +| 0 | ++--------------------------------------+ + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) 
region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0]'), vec_id +LIMIT 2; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec metrics=REDACTED_| +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| +|_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| +|_|_|_SortExec: TopK(fetch=2), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| +|_|_|_CooperativeExec metrics=REDACTED_| +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_| +|_|_| Total rows: REDACTED_| ++-+-+-+ + +DROP TABLE vectors_explain; + +Affected Rows: 0 + +-- ============================================ +-- Part 2: Join with vector order/limit +-- ============================================ +CREATE TABLE vectors_explain_left ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +CREATE TABLE vectors_explain_right ( + vec_id INT, + note STRING, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_explain_left VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +INSERT INTO vectors_explain_right VALUES + (3, 'keep', '2024-01-01 00:00:02'), + (4, 
'keep', '2024-01-01 00:00:03'); + +Affected Rows: 2 + +ADMIN FLUSH_TABLE('vectors_explain_left'); + ++-------------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_explain_left') | ++-------------------------------------------+ +| 0 | ++-------------------------------------------+ + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT l.vec_id +FROM vectors_explain_left l +JOIN vectors_explain_right r ON l.vec_id = r.vec_id +ORDER BY vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), l.vec_id +LIMIT 1; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| +|_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=1 metrics=REDACTED_| +|_|_|_SortExec: TopK(fetch=1), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| +|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_| +|_|_|_HashJoinExec: mode=Partitioned, join_type=Inner, on=[(vec_id@0, vec_id@0)], projection=[vec_id@0, embedding@1] metrics=REDACTED_| 
+|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_| +|_|_|_RepartitionExec: partitioning=Hash([vec_id@0],REDACTED +|_|_|_ProjectionExec: expr=[vec_id@0 as vec_id, embedding@2 as embedding] metrics=REDACTED_| +|_|_|_CooperativeExec metrics=REDACTED_| +|_|_|_MergeScanExec: REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_| +|_|_|_RepartitionExec: partitioning=Hash([vec_id@0],REDACTED +|_|_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| +|_|_|_CooperativeExec metrics=REDACTED_| +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_CooperativeExec metrics=REDACTED_| +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "ts", "embedding"], "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_| +| 1_| 0_|_CooperativeExec metrics=REDACTED_| +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "projection": ["vec_id", "note", "ts"], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED_| +|_|_|_| +|_|_| Total rows: REDACTED_| ++-+-+-+ + +DROP TABLE vectors_explain_left; + +Affected Rows: 0 + +DROP TABLE vectors_explain_right; + +Affected Rows: 0 + +-- ============================================ +-- Part 3: Cosine and dot explain coverage +-- ============================================ +CREATE TABLE vectors_explain_metric ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'cosine'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_explain_metric VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0]'), + (3, '2024-01-01 00:00:02', '[-1.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[0.0, -1.0]'); 
+ +Affected Rows: 4 + +ADMIN FLUSH_TABLE('vectors_explain_metric'); + ++---------------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_explain_metric') | ++---------------------------------------------+ +| 0 | ++---------------------------------------------+ + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain_metric +ORDER BY vec_cos_distance(embedding, '[1.0, 0.0]'), vec_id +LIMIT 2; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec metrics=REDACTED_| +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| +|_|_|_SortPreservingMergeExec: [vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| +|_|_|_SortExec: TopK(fetch=2), expr=[vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| +|_|_|_CooperativeExec metrics=REDACTED_| +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": 
[{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_| +|_|_| Total rows: REDACTED_| ++-+-+-+ + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain_metric +ORDER BY vec_dot_product(embedding, '[1.0, 0.0]') DESC, vec_id +LIMIT 2; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec metrics=REDACTED_| +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| +|_|_|_SortPreservingMergeExec: [vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| +|_|_|_SortExec: TopK(fetch=2), expr=[vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| +|_|_|_CooperativeExec metrics=REDACTED_| +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": 
[{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_| +|_|_| Total rows: REDACTED_| ++-+-+-+ + +DROP TABLE vectors_explain_metric; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.sql b/tests/cases/standalone/common/function/vector/vector_index_explain.sql new file mode 100644 index 0000000000..50bf3cdbaa --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.sql @@ -0,0 +1,155 @@ +-- Vector index explain analyze coverage + + +-- ============================================ +-- Part 1: Single table KNN explain +-- ============================================ +CREATE TABLE vectors_explain ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_explain VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +ADMIN FLUSH_TABLE('vectors_explain'); + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE 
(peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain +ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0]'), vec_id +LIMIT 2; + +DROP TABLE vectors_explain; + +-- ============================================ +-- Part 2: Join with vector order/limit +-- ============================================ +CREATE TABLE vectors_explain_left ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +CREATE TABLE vectors_explain_right ( + vec_id INT, + note STRING, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_explain_left VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +INSERT INTO vectors_explain_right VALUES + (3, 'keep', '2024-01-01 00:00:02'), + (4, 'keep', '2024-01-01 00:00:03'); + +ADMIN FLUSH_TABLE('vectors_explain_left'); + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT l.vec_id +FROM vectors_explain_left l +JOIN 
vectors_explain_right r ON l.vec_id = r.vec_id +ORDER BY vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), l.vec_id +LIMIT 1; + +DROP TABLE vectors_explain_left; +DROP TABLE vectors_explain_right; + +-- ============================================ +-- Part 3: Cosine and dot explain coverage +-- ============================================ +CREATE TABLE vectors_explain_metric ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'cosine'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_explain_metric VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.0, 1.0]'), + (3, '2024-01-01 00:00:02', '[-1.0, 0.0]'), + (4, '2024-01-01 00:00:03', '[0.0, -1.0]'); + +ADMIN FLUSH_TABLE('vectors_explain_metric'); + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain_metric +ORDER BY vec_cos_distance(embedding, '[1.0, 0.0]'), vec_id +LIMIT 2; + +-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics= +-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED +-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED +-- SQLNESS 
REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED" +-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED +-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED +-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics= +-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE VERBOSE +SELECT vec_id +FROM vectors_explain_metric +ORDER BY vec_dot_product(embedding, '[1.0, 0.0]') DESC, vec_id +LIMIT 2; + +DROP TABLE vectors_explain_metric; diff --git a/tests/cases/standalone/common/function/vector/vector_index_join.result b/tests/cases/standalone/common/function/vector/vector_index_join.result new file mode 100644 index 0000000000..51fb393405 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index_join.result @@ -0,0 +1,240 @@ +-- Vector index join/subquery coverage +-- ============================================ +-- Part 1: Join should not pre-limit left table +-- ============================================ +CREATE TABLE vectors_join_left ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +CREATE TABLE vectors_join_right ( + vec_id INT, + note STRING, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_join_left VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +INSERT INTO vectors_join_right VALUES + (3, 'keep', '2024-01-01 00:00:02'), + (4, 'keep', '2024-01-01 00:00:03'); + +Affected 
Rows: 2 + +SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist +FROM vectors_join_left l +JOIN vectors_join_right r ON l.vec_id = r.vec_id +ORDER BY dist, l.vec_id +LIMIT 1; + ++--------+------+ +| vec_id | dist | ++--------+------+ +| 4 | 1.81 | ++--------+------+ + +DROP TABLE vectors_join_left; + +Affected Rows: 0 + +DROP TABLE vectors_join_right; + +Affected Rows: 0 + +-- ============================================ +-- Part 2: Subquery should be a barrier +-- ============================================ +CREATE TABLE vectors_subquery ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_subquery VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist +FROM ( + SELECT * FROM vectors_subquery WHERE vec_id >= 3 +) s +ORDER BY dist, s.vec_id +LIMIT 1; + ++--------+------+ +| vec_id | dist | ++--------+------+ +| 4 | 1.81 | ++--------+------+ + +DROP TABLE vectors_subquery; + +Affected Rows: 0 + +-- ============================================ +-- Part 3: LEFT JOIN should not pre-limit +-- ============================================ +CREATE TABLE vectors_left_join ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +CREATE TABLE vectors_left_join_filter ( + vec_id INT, + keep BOOLEAN, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_left_join VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected 
Rows: 4 + +-- Only vec_id 3,4 have matching rows in filter table +INSERT INTO vectors_left_join_filter VALUES + (3, true, '2024-01-01 00:00:02'), + (4, true, '2024-01-01 00:00:03'); + +Affected Rows: 2 + +-- LEFT JOIN then filter by IS NOT NULL +-- Should return vec_id=4 (dist=1.81), not vec_id=1 or 2 +SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist +FROM vectors_left_join l +LEFT JOIN vectors_left_join_filter r ON l.vec_id = r.vec_id +WHERE r.vec_id IS NOT NULL +ORDER BY dist, l.vec_id +LIMIT 1; + ++--------+------+ +| vec_id | dist | ++--------+------+ +| 4 | 1.81 | ++--------+------+ + +DROP TABLE vectors_left_join; + +Affected Rows: 0 + +DROP TABLE vectors_left_join_filter; + +Affected Rows: 0 + +-- ============================================ +-- Part 4: Inlineable subquery should allow hint +-- ============================================ +CREATE TABLE vectors_inline_subquery ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_inline_subquery VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +ADMIN FLUSH_TABLE('vectors_inline_subquery'); + ++----------------------------------------------+ +| ADMIN FLUSH_TABLE('vectors_inline_subquery') | ++----------------------------------------------+ +| 0 | ++----------------------------------------------+ + +-- Subquery without LIMIT/DISTINCT/aggregation can be inlined +-- Vector hint should be able to push down +SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist +FROM ( + SELECT * FROM vectors_inline_subquery WHERE vec_id >= 1 +) s +ORDER BY dist, s.vec_id +LIMIT 2; + ++--------+------+ +| vec_id | dist | ++--------+------+ +| 1 | 0.0 | +| 2 | 0.01 | ++--------+------+ + +DROP TABLE 
vectors_inline_subquery; + +Affected Rows: 0 + +-- ============================================ +-- Part 5: CTE should be a barrier +-- ============================================ +CREATE TABLE vectors_cte ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +Affected Rows: 0 + +INSERT INTO vectors_cte VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +Affected Rows: 4 + +-- CTE acts as optimization barrier +-- Filter in CTE limits to vec_id >= 3, so result should be vec_id=4 (dist=1.81) +WITH filtered AS ( + SELECT * FROM vectors_cte WHERE vec_id >= 3 +) +SELECT vec_id, round(vec_l2sq_distance(embedding, '[1.0, 0.0]'), 2) AS dist +FROM filtered +ORDER BY dist, vec_id +LIMIT 1; + ++--------+------+ +| vec_id | dist | ++--------+------+ +| 4 | 1.81 | ++--------+------+ + +DROP TABLE vectors_cte; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/vector/vector_index_join.sql b/tests/cases/standalone/common/function/vector/vector_index_join.sql new file mode 100644 index 0000000000..699942b300 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_index_join.sql @@ -0,0 +1,159 @@ +-- Vector index join/subquery coverage + +-- ============================================ +-- Part 1: Join should not pre-limit left table +-- ============================================ +CREATE TABLE vectors_join_left ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +CREATE TABLE vectors_join_right ( + vec_id INT, + note STRING, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_join_left VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', 
'[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +INSERT INTO vectors_join_right VALUES + (3, 'keep', '2024-01-01 00:00:02'), + (4, 'keep', '2024-01-01 00:00:03'); + +SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist +FROM vectors_join_left l +JOIN vectors_join_right r ON l.vec_id = r.vec_id +ORDER BY dist, l.vec_id +LIMIT 1; + +DROP TABLE vectors_join_left; +DROP TABLE vectors_join_right; + +-- ============================================ +-- Part 2: Subquery should be a barrier +-- ============================================ +CREATE TABLE vectors_subquery ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_subquery VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist +FROM ( + SELECT * FROM vectors_subquery WHERE vec_id >= 3 +) s +ORDER BY dist, s.vec_id +LIMIT 1; + +DROP TABLE vectors_subquery; + +-- ============================================ +-- Part 3: LEFT JOIN should not pre-limit +-- ============================================ +CREATE TABLE vectors_left_join ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +CREATE TABLE vectors_left_join_filter ( + vec_id INT, + keep BOOLEAN, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_left_join VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +-- Only vec_id 3,4 have matching rows in filter table +INSERT INTO vectors_left_join_filter VALUES + (3, true, '2024-01-01 00:00:02'), + (4, true, '2024-01-01 
00:00:03'); + +-- LEFT JOIN then filter by IS NOT NULL +-- Should return vec_id=4 (dist=1.81), not vec_id=1 or 2 +SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist +FROM vectors_left_join l +LEFT JOIN vectors_left_join_filter r ON l.vec_id = r.vec_id +WHERE r.vec_id IS NOT NULL +ORDER BY dist, l.vec_id +LIMIT 1; + +DROP TABLE vectors_left_join; +DROP TABLE vectors_left_join_filter; + +-- ============================================ +-- Part 4: Inlineable subquery should allow hint +-- ============================================ +CREATE TABLE vectors_inline_subquery ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_inline_subquery VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +ADMIN FLUSH_TABLE('vectors_inline_subquery'); + +-- Subquery without LIMIT/DISTINCT/aggregation can be inlined +-- Vector hint should be able to push down +SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist +FROM ( + SELECT * FROM vectors_inline_subquery WHERE vec_id >= 1 +) s +ORDER BY dist, s.vec_id +LIMIT 2; + +DROP TABLE vectors_inline_subquery; + +-- ============================================ +-- Part 5: CTE should be a barrier +-- ============================================ +CREATE TABLE vectors_cte ( + vec_id INT, + ts TIMESTAMP TIME INDEX, + embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'), + PRIMARY KEY (vec_id) +); + +INSERT INTO vectors_cte VALUES + (1, '2024-01-01 00:00:00', '[1.0, 0.0]'), + (2, '2024-01-01 00:00:01', '[0.9, 0.0]'), + (3, '2024-01-01 00:00:02', '[0.0, 1.0]'), + (4, '2024-01-01 00:00:03', '[0.0, 0.9]'); + +-- CTE acts as optimization barrier +-- Filter in CTE limits to vec_id >= 3, so result should be vec_id=4 (dist=1.81) +WITH filtered AS ( + 
SELECT * FROM vectors_cte WHERE vec_id >= 3 +) +SELECT vec_id, round(vec_l2sq_distance(embedding, '[1.0, 0.0]'), 2) AS dist +FROM filtered +ORDER BY dist, vec_id +LIMIT 1; + +DROP TABLE vectors_cte; diff --git a/tests/runner/src/env/bare.rs b/tests/runner/src/env/bare.rs index a0bf67dd97..1501d18512 100644 --- a/tests/runner/src/env/bare.rs +++ b/tests/runner/src/env/bare.rs @@ -528,7 +528,7 @@ impl Env { "--bin", "greptime", "--features", - "pg_kvbackend,mysql_kvbackend", + "pg_kvbackend,mysql_kvbackend,vector_index", ]) .output() .expect("Failed to start GreptimeDB");