test: adds sqlness test for vector index (#7634)

* test: adds sqlness test for vector index

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: CI

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* test: redacted flat map and size

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* test: simplify the replace rules

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: update comments and tests

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
This commit is contained in:
dennis zhuang
2026-02-04 11:54:47 +08:00
committed by GitHub
parent 0f2f20d4b7
commit c08f3a4472
11 changed files with 1901 additions and 15 deletions

View File

@@ -119,7 +119,7 @@ jobs:
- name: Build greptime binaries
shell: bash
# `cargo gc` will invoke `cargo build` with specified args
run: cargo gc -- --bin greptime --bin sqlness-runner --features "pg_kvbackend,mysql_kvbackend"
run: cargo gc -- --bin greptime --bin sqlness-runner --features "pg_kvbackend,mysql_kvbackend,vector_index"
- name: Pack greptime binaries
shell: bash
run: |

View File

@@ -1576,6 +1576,10 @@ impl StreamContext {
let exprs: Vec<_> = predicate.exprs().iter().map(|e| e.to_string()).collect();
write!(f, ", \"filters\": {:?}", exprs)?;
}
#[cfg(feature = "vector_index")]
if let Some(vector_index_k) = self.input.vector_index_k {
write!(f, ", \"vector_index_k\": {}", vector_index_k)?;
}
if !self.input.files.is_empty() {
write!(f, ", \"files\": ")?;
f.debug_list()

View File

@@ -323,14 +323,18 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor {
}
// Avoid carrying vector hints across branching inputs (join/subquery) to prevent
// pruning results before global ordering is applied.
let is_branching = matches!(node, LogicalPlan::Subquery(_)) || node.inputs().len() > 1;
if is_branching && self.ts_row_selector.is_some() {
// pruning results before global ordering is applied. Only treat a subquery as a
// barrier when it contains non-inlineable operators.
let is_branching_for_ts = matches!(
node,
LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_)
) || node.inputs().len() > 1;
if is_branching_for_ts && self.ts_row_selector.is_some() {
// clean previous time series selector hint when encounter subqueries or join
self.ts_row_selector = None;
}
#[cfg(feature = "vector_index")]
if is_branching {
if is_branching_for_vector(node) {
self.vector_search.on_branching_enter();
}
@@ -371,8 +375,10 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor {
LogicalPlan::Filter(_) => {
self.vector_search.on_filter_exit();
}
LogicalPlan::Subquery(_) => {
self.vector_search.on_branching_exit();
LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_) => {
if is_branching_for_vector(_node) {
self.vector_search.on_branching_exit();
}
}
_ if _node.inputs().len() > 1 => {
self.vector_search.on_branching_exit();
@@ -398,6 +404,43 @@ impl ScanHintVisitor {
}
}
/// Reports whether `node` acts as a barrier for vector search hints.
///
/// A node with more than one input always counts as branching. A `Subquery` or
/// `SubqueryAlias` counts only when the plan it wraps contains an operator flagged by
/// `has_non_inlineable_ops`. Every other single-input node is not a barrier.
#[cfg(feature = "vector_index")]
fn is_branching_for_vector(node: &LogicalPlan) -> bool {
    let wrapped_plan = match node {
        // Multi-input nodes (e.g. joins) are barriers regardless of their kind.
        _ if node.inputs().len() > 1 => return true,
        LogicalPlan::Subquery(subquery) => subquery.subquery.as_ref(),
        LogicalPlan::SubqueryAlias(alias) => alias.input.as_ref(),
        _ => return false,
    };
    has_non_inlineable_ops(wrapped_plan)
}
/// Returns `true` when `plan`, or any plan reachable through its inputs, is one of
/// Limit, Sort, Distinct, Aggregate, Window, Union or Join.
///
/// The search is depth-first and stops at the first matching operator.
#[cfg(feature = "vector_index")]
fn has_non_inlineable_ops(plan: &LogicalPlan) -> bool {
    match plan {
        LogicalPlan::Limit(_)
        | LogicalPlan::Sort(_)
        | LogicalPlan::Distinct(_)
        | LogicalPlan::Aggregate(_)
        | LogicalPlan::Window(_)
        | LogicalPlan::Union(_)
        | LogicalPlan::Join(_) => true,
        // Not a barrier itself: keep looking in the children.
        _ => plan.inputs().into_iter().any(has_non_inlineable_ops),
    }
}
#[cfg(test)]
mod test {
use std::sync::Arc;

View File

@@ -35,6 +35,16 @@ use crate::dummy_catalog::DummyTableProvider;
/// - A LIMIT (or Sort.fetch) is present to derive k.
/// - The hint stays within a single input chain (not across join/subquery branches).
/// - The target column is non-nullable, or an explicit IS NOT NULL filter exists.
///
/// Known limitations:
/// - Dynamic overfetching is not implemented yet. When filters exist or ORDER BY includes
/// additional tie-breaker columns (e.g., ORDER BY distance, id), the current fixed k may
/// return incorrect results. A future improvement should dynamically adjust k based on
/// filter selectivity and secondary sort requirements.
/// - Hints are blocked at subquery boundaries only when the subquery contains non-inlineable
/// operators (Limit/Sort/Distinct/Aggregate/Window/Union/Join). Simple subqueries without
/// these operators allow hints to propagate through. In distributed mode, the dist analyzer
/// may inline subqueries before this rule runs, further reducing isolation.
#[derive(Default)]
pub(crate) struct VectorSearchState {
current_distance: Option<VectorDistanceInfo>,
@@ -241,23 +251,47 @@ impl VectorSearchState {
fn extract_distance_from_sort(
sort: &datafusion_expr::logical_plan::Sort,
) -> Option<VectorDistanceInfo> {
if sort.expr.len() != 1 {
debug!(
"Skip vector hint: Sort has {} expressions, expected 1",
sort.expr.len()
);
if sort.expr.is_empty() {
debug!("Skip vector hint: Sort has no expressions");
return None;
}
let sort_expr: &SortExpr = &sort.expr[0];
let info = Self::extract_distance_info(&sort_expr.expr)?;
let expected_asc = info.metric != VectorDistanceMetric::InnerProduct;
if sort_expr.asc == expected_asc {
if sort_expr.asc != expected_asc {
return None;
}
if Self::tie_breakers_allowed(&sort.expr[1..], &info) {
Some(info)
} else {
if sort.expr.len() > 1 {
debug!(
"Skip vector hint: Sort has unsupported tie-breakers ({} expressions)",
sort.expr.len()
);
}
None
}
}
/// Decides whether the sort expressions that follow the distance expression are acceptable
/// tie-breakers for a vector search hint.
///
/// Every tie-breaker must be a plain column reference whose relation equals the table
/// reference recorded in `distance_info`; when no table reference is recorded, the column
/// must be unqualified. An empty slice is accepted.
fn tie_breakers_allowed(sort_exprs: &[SortExpr], distance_info: &VectorDistanceInfo) -> bool {
    let expected_relation = distance_info.table_reference.as_ref();
    // `all` is vacuously true for an empty slice, so no separate emptiness check is needed.
    sort_exprs.iter().all(|tie_breaker| match &tie_breaker.expr {
        Expr::Column(column) => column.relation.as_ref() == expected_relation,
        _ => false,
    })
}
fn extract_limit_info(limit: &datafusion_expr::logical_plan::Limit) -> Option<VectorLimitInfo> {
let fetch = match limit.get_fetch_type().ok()? {
FetchType::Literal(fetch) => fetch?,
@@ -767,8 +801,10 @@ mod tests {
assert!(t2_provider.get_vector_search_hint().is_none());
}
// Simple subqueries (without non-inlineable ops like Limit/Sort/Distinct/Aggregate/Window)
// allow hints to propagate through. See known limitations in VectorSearchState docs.
#[test]
fn test_no_vector_hint_above_subquery() {
fn test_simple_subquery_allows_hint_propagation() {
let provider = build_dummy_provider(10);
let table_source = Arc::new(DefaultTableSource::new(provider.clone()));
let scan_plan = LogicalPlanBuilder::scan_with_filters("t", table_source, None, vec![])
@@ -794,6 +830,42 @@ mod tests {
let context = OptimizerContext::default();
let _ = ScanHintRule.rewrite(plan, &context).unwrap();
// Hint propagates through simple subquery
let hint = provider.get_vector_search_hint().unwrap();
assert_eq!(hint.k, 5);
}
// A subquery whose plan contains a non-inlineable operator (a LIMIT in this test) must stop
// the outer vector hint from reaching the scan.
#[test]
fn test_subquery_with_limit_blocks_hint() {
    let provider = build_dummy_provider(10);
    let source = Arc::new(DefaultTableSource::new(provider.clone()));

    // Inner plan: table scan followed by LIMIT 100, the non-inlineable op inside the subquery.
    let limited_scan = LogicalPlanBuilder::scan_with_filters("t", source, None, vec![])
        .unwrap()
        .limit(0, Some(100))
        .unwrap()
        .build()
        .unwrap();
    let wrapped = LogicalPlan::Subquery(Subquery {
        subquery: Arc::new(limited_scan),
        outer_ref_columns: vec![],
        spans: Default::default(),
    });

    // Outer plan: ORDER BY distance ASC LIMIT 5 on top of the subquery.
    let distance = vec_distance_expr(VEC_L2SQ_DISTANCE);
    let knn_plan = LogicalPlanBuilder::from(wrapped)
        .sort(vec![distance.sort(true, false)])
        .unwrap()
        .limit(0, Some(5))
        .unwrap()
        .build()
        .unwrap();

    let optimizer_context = OptimizerContext::default();
    let _ = ScanHintRule.rewrite(knn_plan, &optimizer_context).unwrap();

    // The scan beneath the subquery must not have received a hint.
    assert!(provider.get_vector_search_hint().is_none());
}

View File

@@ -0,0 +1,602 @@
-- Test vector index creation and KNN search
-- ============================================
-- Part 1: Basic L2 squared distance tests
-- ============================================
-- Create a table with vector column and L2sq vector index
CREATE TABLE vectors_l2sq (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert test vectors
INSERT INTO vectors_l2sq VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.0, 0.0, 1.0]'),
(5, '2024-01-01 00:00:04', '[1.0, 1.0, 0.0, 0.0]'),
(6, '2024-01-01 00:00:05', '[0.0, 1.0, 1.0, 0.0]'),
(7, '2024-01-01 00:00:06', '[0.0, 0.0, 1.0, 1.0]'),
(8, '2024-01-01 00:00:07', '[1.0, 0.0, 0.0, 1.0]');
Affected Rows: 8
-- Query BEFORE flush (memtable search)
-- Expected: vec_id=1 (distance=0), vec_id=5 (distance=1), vec_id=8 (distance=1)
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+----------+
| vec_id | distance |
+--------+----------+
| 1 | 0.0 |
| 5 | 1.0 |
| 8 | 1.0 |
+--------+----------+
-- Flush to create SST files with vector index
ADMIN FLUSH_TABLE('vectors_l2sq');
+-----------------------------------+
| ADMIN FLUSH_TABLE('vectors_l2sq') |
+-----------------------------------+
| 0 |
+-----------------------------------+
-- Query AFTER flush (SST index search)
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+----------+
| vec_id | distance |
+--------+----------+
| 1 | 0.0 |
| 5 | 1.0 |
| 8 | 1.0 |
+--------+----------+
-- Query with different target vector
-- Expected: vec_id=6 (distance=0), vec_id=2 (distance=1), vec_id=3 (distance=1)
SELECT vec_id, vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]'), vec_id
LIMIT 3;
+--------+----------+
| vec_id | distance |
+--------+----------+
| 6 | 0.0 |
| 2 | 1.0 |
| 3 | 1.0 |
+--------+----------+
DROP TABLE vectors_l2sq;
Affected Rows: 0
-- ============================================
-- Part 2: Cosine distance tests
-- ============================================
CREATE TABLE vectors_cosine (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'cosine'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert vectors with different magnitudes but same/different directions
INSERT INTO vectors_cosine VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]');
Affected Rows: 5
-- Memtable search with cosine distance
-- vec_id=1 and vec_id=2 should have same cosine distance (0) since they point same direction
SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_cosine
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+------------+
| vec_id | distance |
+--------+------------+
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.29289323 |
+--------+------------+
ADMIN FLUSH_TABLE('vectors_cosine');
+-------------------------------------+
| ADMIN FLUSH_TABLE('vectors_cosine') |
+-------------------------------------+
| 0 |
+-------------------------------------+
-- SST index search with cosine distance
SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_cosine
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+------------+
| vec_id | distance |
+--------+------------+
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.29289323 |
+--------+------------+
DROP TABLE vectors_cosine;
Affected Rows: 0
-- ============================================
-- Part 3: Dot product (inner product) tests
-- ============================================
CREATE TABLE vectors_dot (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'dot'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_dot VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]');
Affected Rows: 5
-- Memtable search with dot product
-- Larger dot product means more similar, so we use negative for ordering
-- vec_id=2 should be best (dot=2), vec_id=1 and vec_id=4 have dot=1
SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product
FROM vectors_dot
ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id
LIMIT 3;
+--------+-------------+
| vec_id | dot_product |
+--------+-------------+
| 2 | 2.0 |
| 1 | 1.0 |
| 4 | 1.0 |
+--------+-------------+
ADMIN FLUSH_TABLE('vectors_dot');
+----------------------------------+
| ADMIN FLUSH_TABLE('vectors_dot') |
+----------------------------------+
| 0 |
+----------------------------------+
-- SST index search with dot product
SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product
FROM vectors_dot
ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id
LIMIT 3;
+--------+-------------+
| vec_id | dot_product |
+--------+-------------+
| 2 | 2.0 |
| 1 | 1.0 |
| 4 | 1.0 |
+--------+-------------+
DROP TABLE vectors_dot;
Affected Rows: 0
-- ============================================
-- Part 4: NULL vector handling tests
-- ============================================
CREATE TABLE vectors_null (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert vectors with some NULLs
INSERT INTO vectors_null VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', NULL),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', NULL),
(5, '2024-01-01 00:00:04', '[0.0, 0.0, 1.0, 0.0]');
Affected Rows: 5
-- Memtable search should skip NULL vectors
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_null
WHERE embedding IS NOT NULL
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+----------+
| vec_id | distance |
+--------+----------+
| 1 | 0.0 |
| 3 | 2.0 |
| 5 | 2.0 |
+--------+----------+
ADMIN FLUSH_TABLE('vectors_null');
+-----------------------------------+
| ADMIN FLUSH_TABLE('vectors_null') |
+-----------------------------------+
| 0 |
+-----------------------------------+
-- SST index search should also skip NULL vectors
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_null
WHERE embedding IS NOT NULL
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+----------+
| vec_id | distance |
+--------+----------+
| 1 | 0.0 |
| 3 | 2.0 |
| 5 | 2.0 |
+--------+----------+
DROP TABLE vectors_null;
Affected Rows: 0
-- ============================================
-- Part 5: Mixed memtable + SST search tests
-- ============================================
CREATE TABLE vectors_mixed (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert first batch and flush to SST
INSERT INTO vectors_mixed VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]');
Affected Rows: 3
ADMIN FLUSH_TABLE('vectors_mixed');
+------------------------------------+
| ADMIN FLUSH_TABLE('vectors_mixed') |
+------------------------------------+
| 0 |
+------------------------------------+
-- Insert second batch (stays in memtable)
INSERT INTO vectors_mixed VALUES
(4, '2024-01-01 00:00:03', '[0.5, 0.5, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[0.9, 0.1, 0.0, 0.0]');
Affected Rows: 2
-- Query should search both SST (vec_id 1,2,3) and memtable (vec_id 4,5)
-- Target: [1.0, 0.0, 0.0, 0.0]
-- Expected: vec_id=1 (dist=0), vec_id=5 (dist=0.02), vec_id=4 (dist=0.5)
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_mixed
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
+--------+-------------+
| vec_id | distance |
+--------+-------------+
| 1 | 0.0 |
| 5 | 0.020000005 |
| 4 | 0.5 |
+--------+-------------+
DROP TABLE vectors_mixed;
Affected Rows: 0
-- ============================================
-- Part 6: KNN with WHERE clause tests
-- ============================================
CREATE TABLE vectors_filter (
vec_id INT,
category STRING,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_filter VALUES
(1, 'A', '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, 'B', '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'),
(3, 'A', '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, 'B', '2024-01-01 00:00:03', '[0.1, 0.9, 0.0, 0.0]'),
(5, 'A', '2024-01-01 00:00:04', '[0.5, 0.5, 0.0, 0.0]');
Affected Rows: 5
-- Memtable search with filter
SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE category = 'A'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+----------+----------+
| vec_id | category | distance |
+--------+----------+----------+
| 1 | A | 0.0 |
| 5 | A | 0.5 |
+--------+----------+----------+
ADMIN FLUSH_TABLE('vectors_filter');
+-------------------------------------+
| ADMIN FLUSH_TABLE('vectors_filter') |
+-------------------------------------+
| 0 |
+-------------------------------------+
-- SST index search with filter
SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE category = 'A'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+----------+----------+
| vec_id | category | distance |
+--------+----------+----------+
| 1 | A | 0.0 |
| 5 | A | 0.5 |
+--------+----------+----------+
-- Filter with time range
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE ts >= '2024-01-01 00:00:02' AND ts <= '2024-01-01 00:00:04'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+-----------+
| vec_id | distance |
+--------+-----------+
| 5 | 0.5 |
| 4 | 1.6199999 |
+--------+-----------+
DROP TABLE vectors_filter;
Affected Rows: 0
-- ============================================
-- Part 7: Higher dimension vectors
-- ============================================
CREATE TABLE vectors_high_dim (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(128) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert high-dimensional vectors (simplified: first few elements differ)
INSERT INTO vectors_high_dim VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]');
Affected Rows: 3
-- Memtable search
SELECT vec_id FROM vectors_high_dim
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+
| vec_id |
+--------+
| 1 |
| 3 |
+--------+
ADMIN FLUSH_TABLE('vectors_high_dim');
+---------------------------------------+
| ADMIN FLUSH_TABLE('vectors_high_dim') |
+---------------------------------------+
| 0 |
+---------------------------------------+
-- SST index search
SELECT vec_id FROM vectors_high_dim
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+
| vec_id |
+--------+
| 1 |
| 3 |
+--------+
DROP TABLE vectors_high_dim;
Affected Rows: 0
-- ============================================
-- Part 8: Different k values (LIMIT)
-- ============================================
CREATE TABLE vectors_k (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_k VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.8, 0.2, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.7, 0.3, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[0.6, 0.4, 0.0, 0.0]'),
(6, '2024-01-01 00:00:05', '[0.5, 0.5, 0.0, 0.0]'),
(7, '2024-01-01 00:00:06', '[0.4, 0.6, 0.0, 0.0]'),
(8, '2024-01-01 00:00:07', '[0.3, 0.7, 0.0, 0.0]'),
(9, '2024-01-01 00:00:08', '[0.2, 0.8, 0.0, 0.0]'),
(10, '2024-01-01 00:00:09', '[0.1, 0.9, 0.0, 0.0]');
Affected Rows: 10
ADMIN FLUSH_TABLE('vectors_k');
+--------------------------------+
| ADMIN FLUSH_TABLE('vectors_k') |
+--------------------------------+
| 0 |
+--------------------------------+
-- k=1
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 1;
+--------+
| vec_id |
+--------+
| 1 |
+--------+
-- k=5
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 5;
+--------+
| vec_id |
+--------+
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
+--------+
-- k=10 (all vectors)
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 10;
+--------+
| vec_id |
+--------+
| 1 |
| 2 |
| 3 |
| 4 |
| 5 |
| 6 |
| 7 |
| 8 |
| 9 |
| 10 |
+--------+
DROP TABLE vectors_k;
Affected Rows: 0
-- ============================================
-- Part 9: Engine parameter tests
-- ============================================
-- Create table with explicit engine parameter (usearch is default)
CREATE TABLE vectors_engine (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (engine = 'usearch', metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
-- Insert test vectors
INSERT INTO vectors_engine VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0]');
Affected Rows: 3
-- Memtable search
SELECT vec_id FROM vectors_engine
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+
| vec_id |
+--------+
| 1 |
| 3 |
+--------+
ADMIN FLUSH_TABLE('vectors_engine');
+-------------------------------------+
| ADMIN FLUSH_TABLE('vectors_engine') |
+-------------------------------------+
| 0 |
+-------------------------------------+
-- SST index search with usearch engine
SELECT vec_id FROM vectors_engine
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
+--------+
| vec_id |
+--------+
| 1 |
| 3 |
+--------+
DROP TABLE vectors_engine;
Affected Rows: 0

View File

@@ -0,0 +1,341 @@
-- Test vector index creation and KNN search
-- ============================================
-- Part 1: Basic L2 squared distance tests
-- ============================================
-- Every KNN query in this part orders by (distance, vec_id), so rows with equal
-- distance are returned in ascending vec_id order and the output is deterministic.
-- Create a table with vector column and L2sq vector index
CREATE TABLE vectors_l2sq (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Insert test vectors
INSERT INTO vectors_l2sq VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.0, 0.0, 1.0]'),
(5, '2024-01-01 00:00:04', '[1.0, 1.0, 0.0, 0.0]'),
(6, '2024-01-01 00:00:05', '[0.0, 1.0, 1.0, 0.0]'),
(7, '2024-01-01 00:00:06', '[0.0, 0.0, 1.0, 1.0]'),
(8, '2024-01-01 00:00:07', '[1.0, 0.0, 0.0, 1.0]');
-- Query BEFORE flush (memtable search)
-- Expected: vec_id=1 (distance=0), vec_id=5 (distance=1), vec_id=8 (distance=1)
-- vec_id 5 and 8 tie at distance=1; the vec_id sort key puts 5 first.
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
-- Flush to create SST files with vector index
ADMIN FLUSH_TABLE('vectors_l2sq');
-- Query AFTER flush (SST index search)
-- Same query as before the flush; the data is unchanged, so it should return the same three rows.
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
-- Query with different target vector
-- Expected: vec_id=6 (distance=0), vec_id=2 (distance=1), vec_id=3 (distance=1)
SELECT vec_id, vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]') as distance
FROM vectors_l2sq
ORDER BY vec_l2sq_distance(embedding, '[0.0, 1.0, 1.0, 0.0]'), vec_id
LIMIT 3;
DROP TABLE vectors_l2sq;
-- ============================================
-- Part 2: Cosine distance tests
-- ============================================
CREATE TABLE vectors_cosine (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'cosine'),
PRIMARY KEY (vec_id)
);
-- Insert vectors with different magnitudes and with the same or different directions
INSERT INTO vectors_cosine VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]');
-- Memtable search with cosine distance
-- vec_id=1 and vec_id=2 should have the same cosine distance (0) because they point in the
-- same direction; only their magnitudes differ. The vec_id sort key then puts 1 before 2.
-- Assuming vec_cos_distance is 1 - cosine similarity (TODO confirm), the third row should be
-- vec_id=4 (about 0.29), ahead of vec_id=3 (1) and vec_id=5 (2).
SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_cosine
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
ADMIN FLUSH_TABLE('vectors_cosine');
-- SST index search with cosine distance
-- Same query as the memtable search above, now served from the flushed SST.
SELECT vec_id, vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_cosine
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
DROP TABLE vectors_cosine;
-- ============================================
-- Part 3: Dot product (inner product) tests
-- ============================================
CREATE TABLE vectors_dot (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'dot'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_dot VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[2.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[1.0, 1.0, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[-1.0, 0.0, 0.0, 0.0]');
-- Memtable search with dot product
-- A larger dot product means more similar, so the query orders by the dot product DESC
-- (unlike the distance functions, which are ordered ascending).
-- vec_id=2 should be best (dot=2); vec_id=1 and vec_id=4 both have dot=1 and are
-- returned in that order because vec_id is the ascending tiebreak.
SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product
FROM vectors_dot
ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id
LIMIT 3;
ADMIN FLUSH_TABLE('vectors_dot');
-- SST index search with dot product
-- Same query as the memtable search above, now served from the flushed SST.
SELECT vec_id, vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') as dot_product
FROM vectors_dot
ORDER BY vec_dot_product(embedding, '[1.0, 0.0, 0.0, 0.0]') DESC, vec_id
LIMIT 3;
DROP TABLE vectors_dot;
-- ============================================
-- Part 4: NULL vector handling tests
-- ============================================
-- The embedding column is declared without NOT NULL here so that NULL vectors can be stored.
CREATE TABLE vectors_null (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Insert vectors with some NULLs
INSERT INTO vectors_null VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', NULL),
(3, '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', NULL),
(5, '2024-01-01 00:00:04', '[0.0, 0.0, 1.0, 0.0]');
-- Memtable search should skip NULL vectors
-- Note: the query filters with IS NOT NULL explicitly, so rows 2 and 4 are excluded by the
-- WHERE clause; this checks KNN combined with that filter rather than implicit NULL skipping.
-- Expected: vec_id=1 (distance=0), vec_id=3 (distance=2), vec_id=5 (distance=2)
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_null
WHERE embedding IS NOT NULL
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
ADMIN FLUSH_TABLE('vectors_null');
-- SST index search should also skip NULL vectors
-- Same query and the same expected rows as the memtable search above.
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_null
WHERE embedding IS NOT NULL
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
DROP TABLE vectors_null;
-- ============================================
-- Part 5: Mixed memtable + SST search tests
-- ============================================
CREATE TABLE vectors_mixed (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Insert first batch and flush to SST
INSERT INTO vectors_mixed VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 0.0, 1.0, 0.0]');
ADMIN FLUSH_TABLE('vectors_mixed');
-- Insert second batch (stays in memtable)
INSERT INTO vectors_mixed VALUES
(4, '2024-01-01 00:00:03', '[0.5, 0.5, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[0.9, 0.1, 0.0, 0.0]');
-- Query should search both SST (vec_id 1,2,3) and memtable (vec_id 4,5)
-- Target: [1.0, 0.0, 0.0, 0.0]
-- Expected: vec_id=1 (dist=0), vec_id=5 (dist=0.02), vec_id=4 (dist=0.5)
-- The top 3 needs rows from both sources: vec_id 1 is only in the SST, while
-- vec_id 4 and 5 are only in the memtable, so neither source alone gives the right answer.
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_mixed
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 3;
DROP TABLE vectors_mixed;
-- ============================================
-- Part 6: KNN with WHERE clause tests
-- ============================================
CREATE TABLE vectors_filter (
vec_id INT,
category STRING,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- The data is arranged so that the filter matters: vec_id=2 (category B, distance about 0.02)
-- is the second-nearest row overall and must not appear in the category = 'A' results.
INSERT INTO vectors_filter VALUES
(1, 'A', '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, 'B', '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'),
(3, 'A', '2024-01-01 00:00:02', '[0.0, 1.0, 0.0, 0.0]'),
(4, 'B', '2024-01-01 00:00:03', '[0.1, 0.9, 0.0, 0.0]'),
(5, 'A', '2024-01-01 00:00:04', '[0.5, 0.5, 0.0, 0.0]');
-- Memtable search with filter
-- Expected: vec_id=1 (distance=0), vec_id=5 (distance=0.5)
SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE category = 'A'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
ADMIN FLUSH_TABLE('vectors_filter');
-- SST index search with filter
-- Same query and the same expected rows as the memtable search above.
SELECT vec_id, category, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE category = 'A'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
-- Filter with time range
-- The inclusive range keeps vec_id 3, 4 and 5 and drops the two nearest rows (vec_id 1 and 2).
-- Expected: vec_id=5 (distance=0.5), vec_id=4 (distance about 1.62)
SELECT vec_id, vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]') as distance
FROM vectors_filter
WHERE ts >= '2024-01-01 00:00:02' AND ts <= '2024-01-01 00:00:04'
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
DROP TABLE vectors_filter;
-- ============================================
-- Part 7: Higher dimension vectors
-- ============================================
CREATE TABLE vectors_high_dim (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(128) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Insert high-dimensional vectors (simplified: first few elements differ)
INSERT INTO vectors_high_dim VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]');
-- Memtable search
SELECT vec_id FROM vectors_high_dim
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
ADMIN FLUSH_TABLE('vectors_high_dim');
-- SST index search
SELECT vec_id FROM vectors_high_dim
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
DROP TABLE vectors_high_dim;
-- ============================================
-- Part 8: Different k values (LIMIT)
-- ============================================
CREATE TABLE vectors_k (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Each row moves 0.1 further from the target [1.0, 0.0, 0.0, 0.0] than the previous one,
-- so the L2 squared distance grows strictly with vec_id and the top-k rows for any k
-- are exactly vec_id 1..k.
INSERT INTO vectors_k VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.1, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.8, 0.2, 0.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.7, 0.3, 0.0, 0.0]'),
(5, '2024-01-01 00:00:04', '[0.6, 0.4, 0.0, 0.0]'),
(6, '2024-01-01 00:00:05', '[0.5, 0.5, 0.0, 0.0]'),
(7, '2024-01-01 00:00:06', '[0.4, 0.6, 0.0, 0.0]'),
(8, '2024-01-01 00:00:07', '[0.3, 0.7, 0.0, 0.0]'),
(9, '2024-01-01 00:00:08', '[0.2, 0.8, 0.0, 0.0]'),
(10, '2024-01-01 00:00:09', '[0.1, 0.9, 0.0, 0.0]');
-- All queries in this part run after the flush, i.e. against the SST.
ADMIN FLUSH_TABLE('vectors_k');
-- k=1
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 1;
-- k=5
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 5;
-- k=10 (all vectors)
SELECT vec_id FROM vectors_k
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 10;
DROP TABLE vectors_k;
-- ============================================
-- Part 9: Engine parameter tests
-- ============================================
-- Create table with explicit engine parameter (usearch is default)
-- This checks that spelling out engine = 'usearch' in the WITH options is accepted
-- and that KNN queries still work on such a table.
CREATE TABLE vectors_engine (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(4) NOT NULL VECTOR INDEX WITH (engine = 'usearch', metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
-- Insert test vectors
INSERT INTO vectors_engine VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0, 0.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0, 0.0, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.5, 0.5, 0.0, 0.0]');
-- Memtable search
-- Expected: vec_id=1 (distance=0), vec_id=3 (distance=0.5); vec_id=2 (distance=2) is cut by LIMIT 2.
SELECT vec_id FROM vectors_engine
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
ADMIN FLUSH_TABLE('vectors_engine');
-- SST index search with usearch engine
-- Same query and the same expected rows as the memtable search above.
SELECT vec_id FROM vectors_engine
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0, 0.0, 0.0]'), vec_id
LIMIT 2;
DROP TABLE vectors_engine;

View File

@@ -0,0 +1,270 @@
-- Vector index explain analyze coverage
-- ============================================
-- Part 1: Single table KNN explain
-- ============================================
CREATE TABLE vectors_explain (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_explain VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
ADMIN FLUSH_TABLE('vectors_explain');
+--------------------------------------+
| ADMIN FLUSH_TABLE('vectors_explain') |
+--------------------------------------+
| 0 |
+--------------------------------------+
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0]'), vec_id
LIMIT 2;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_CooperativeExec metrics=REDACTED_|
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_|
|_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_|
|_|_|_SortExec: TopK(fetch=2), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_|
|_|_|_CooperativeExec metrics=REDACTED_|
|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED |
|_|_|_|
|_|_| Total rows: REDACTED_|
+-+-+-+
DROP TABLE vectors_explain;
Affected Rows: 0
-- ============================================
-- Part 2: Join with vector order/limit
-- ============================================
CREATE TABLE vectors_explain_left (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
CREATE TABLE vectors_explain_right (
vec_id INT,
note STRING,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_explain_left VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
INSERT INTO vectors_explain_right VALUES
(3, 'keep', '2024-01-01 00:00:02'),
(4, 'keep', '2024-01-01 00:00:03');
Affected Rows: 2
ADMIN FLUSH_TABLE('vectors_explain_left');
+-------------------------------------------+
| ADMIN FLUSH_TABLE('vectors_explain_left') |
+-------------------------------------------+
| 0 |
+-------------------------------------------+
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT l.vec_id
FROM vectors_explain_left l
JOIN vectors_explain_right r ON l.vec_id = r.vec_id
ORDER BY vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), l.vec_id
LIMIT 1;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_|
|_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=1 metrics=REDACTED_|
|_|_|_SortExec: TopK(fetch=1), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_|
|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_|
|_|_|_HashJoinExec: mode=Partitioned, join_type=Inner, on=[(vec_id@0, vec_id@0)], projection=[vec_id@0, embedding@1] metrics=REDACTED_|
|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_|
|_|_|_RepartitionExec: partitioning=Hash([vec_id@0],REDACTED
|_|_|_ProjectionExec: expr=[vec_id@0 as vec_id, embedding@2 as embedding] metrics=REDACTED_|
|_|_|_CooperativeExec metrics=REDACTED_|
|_|_|_MergeScanExec: REDACTED
|_|_|_CoalesceBatchesExec: target_batch_size=8192 metrics=REDACTED_|
|_|_|_RepartitionExec: partitioning=Hash([vec_id@0],REDACTED
|_|_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_|
|_|_|_CooperativeExec metrics=REDACTED_|
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_CooperativeExec metrics=REDACTED_|
|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "ts", "embedding"], "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED |
|_|_|_|
| 1_| 0_|_CooperativeExec metrics=REDACTED_|
|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "projection": ["vec_id", "note", "ts"], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED_|
|_|_|_|
|_|_| Total rows: REDACTED_|
+-+-+-+
DROP TABLE vectors_explain_left;
Affected Rows: 0
DROP TABLE vectors_explain_right;
Affected Rows: 0
-- ============================================
-- Part 3: Cosine and dot explain coverage
-- ============================================
CREATE TABLE vectors_explain_metric (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'cosine'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_explain_metric VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0]'),
(3, '2024-01-01 00:00:02', '[-1.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.0, -1.0]');
Affected Rows: 4
ADMIN FLUSH_TABLE('vectors_explain_metric');
+---------------------------------------------+
| ADMIN FLUSH_TABLE('vectors_explain_metric') |
+---------------------------------------------+
| 0 |
+---------------------------------------------+
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain_metric
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0]'), vec_id
LIMIT 2;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_CooperativeExec metrics=REDACTED_|
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_|
|_|_|_SortPreservingMergeExec: [vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_|
|_|_|_SortExec: TopK(fetch=2), expr=[vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_|
|_|_|_CooperativeExec metrics=REDACTED_|
|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED |
|_|_|_|
|_|_| Total rows: REDACTED_|
+-+-+-+
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain_metric
ORDER BY vec_dot_product(embedding, '[1.0, 0.0]') DESC, vec_id
LIMIT 2;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_CooperativeExec metrics=REDACTED_|
|_|_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_|
|_|_|_SortPreservingMergeExec: [vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_|
|_|_|_SortExec: TopK(fetch=2), expr=[vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_|
|_|_|_CooperativeExec metrics=REDACTED_|
|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":902}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED |
|_|_|_|
|_|_| Total rows: REDACTED_|
+-+-+-+
DROP TABLE vectors_explain_metric;
Affected Rows: 0

View File

@@ -0,0 +1,155 @@
-- Vector index explain analyze coverage
-- ============================================
-- Part 1: Single table KNN explain
-- ============================================
CREATE TABLE vectors_explain (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_explain VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
-- Flush first so the scan in the plan below reads an SST file rather than the memtable.
ADMIN FLUSH_TABLE('vectors_explain');
-- The replace rules below redact run-dependent parts of the plan output (metrics, file ids,
-- time ranges, durations, sizes, row totals, peers, region ids) and collapse runs of dashes
-- and whitespace, so the recorded plan is stable across runs.
-- What this case asserts: the SeqScan line reports "vector_index_k": 2, matching LIMIT 2.
-- NOTE(review): the size rule requires a quote directly before `size`, so "index_size" is not
-- redacted and the recorded plan pins its exact value - confirm that is intended.
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain
ORDER BY vec_l2sq_distance(embedding, '[1.0, 0.0]'), vec_id
LIMIT 2;
DROP TABLE vectors_explain;
-- ============================================
-- Part 2: Join with vector order/limit
-- ============================================
-- The vector ORDER BY / LIMIT applies to the join result, not to the left table alone.
-- The right table only contains vec_id 3 and 4, while the rows nearest to the target in the
-- left table are vec_id 1 and 2, so limiting the left scan to its own top-1 would lose the
-- correct answer. The plan is therefore expected to show NO "vector_index_k" on the left scan.
CREATE TABLE vectors_explain_left (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
CREATE TABLE vectors_explain_right (
vec_id INT,
note STRING,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_explain_left VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
INSERT INTO vectors_explain_right VALUES
(3, 'keep', '2024-01-01 00:00:02'),
(4, 'keep', '2024-01-01 00:00:03');
-- Only the left table is flushed; the right table is still read from the memtable.
ADMIN FLUSH_TABLE('vectors_explain_left');
-- The replace rules below redact run-dependent parts of the plan output so it is stable across runs.
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT l.vec_id
FROM vectors_explain_left l
JOIN vectors_explain_right r ON l.vec_id = r.vec_id
ORDER BY vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), l.vec_id
LIMIT 1;
DROP TABLE vectors_explain_left;
DROP TABLE vectors_explain_right;
-- ============================================
-- Part 3: Cosine and dot explain coverage
-- ============================================
-- Both EXPLAIN queries below run against the same table, whose vector index is built
-- with metric = 'cosine'.
CREATE TABLE vectors_explain_metric (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'cosine'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_explain_metric VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.0, 1.0]'),
(3, '2024-01-01 00:00:02', '[-1.0, 0.0]'),
(4, '2024-01-01 00:00:03', '[0.0, -1.0]');
ADMIN FLUSH_TABLE('vectors_explain_metric');
-- Cosine distance, ascending: the query metric matches the index metric.
-- The replace rules below redact run-dependent parts of the plan output so it is stable across runs.
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain_metric
ORDER BY vec_cos_distance(embedding, '[1.0, 0.0]'), vec_id
LIMIT 2;
-- Dot product, descending, against the same cosine-metric index.
-- NOTE(review): the query metric (dot product) differs from the index metric (cosine) here.
-- All four stored vectors are unit length, so both metrics rank them identically in this
-- data set - confirm that pushing a top-k hint to the scan is intended when the metrics differ.
-- SQLNESS REPLACE ("metrics_per_partition":\s*.*metrics=) "metrics_per_partition": REDACTED metrics=
-- SQLNESS REPLACE (metrics=\{.*\}) metrics=REDACTED
-- SQLNESS REPLACE (metrics=\[[^\]]*\]) metrics=REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
-- SQLNESS REPLACE Hash\(\[vec_id@0\],.* Hash([vec_id@0],REDACTED
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE "(file_id|time_range_start|time_range_end)":"[^"]+" "$1":"REDACTED"
-- SQLNESS REPLACE ("[a-z_]+":"[0-9\.]+(ns|us|µs|ms|s)") "DURATION": REDACTED
-- SQLNESS REPLACE "(size|flat_format)":\s*(\d+|true|false) "$1":REDACTED
-- SQLNESS REPLACE ,\s*filter=.*?metrics= metrics=
-- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE VERBOSE
SELECT vec_id
FROM vectors_explain_metric
ORDER BY vec_dot_product(embedding, '[1.0, 0.0]') DESC, vec_id
LIMIT 2;
DROP TABLE vectors_explain_metric;

View File

@@ -0,0 +1,240 @@
-- Vector index join/subquery coverage
-- ============================================
-- Part 1: Join should not pre-limit left table
-- ============================================
CREATE TABLE vectors_join_left (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
CREATE TABLE vectors_join_right (
vec_id INT,
note STRING,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_join_left VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
INSERT INTO vectors_join_right VALUES
(3, 'keep', '2024-01-01 00:00:02'),
(4, 'keep', '2024-01-01 00:00:03');
Affected Rows: 2
-- Squared L2 distances to [1.0, 0.0]: id 1 -> 0.0, id 2 -> 0.01, id 3 -> 2.0, id 4 -> 1.81.
-- Only ids 3 and 4 have a partner in vectors_join_right, so the nearest joined row is id 4.
-- The globally nearest rows (ids 1, 2) do not survive the join: if the left scan kept only
-- the top-1 nearest row before joining, the result would be empty instead of id 4.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist
FROM vectors_join_left l
JOIN vectors_join_right r ON l.vec_id = r.vec_id
ORDER BY dist, l.vec_id
LIMIT 1;
+--------+------+
| vec_id | dist |
+--------+------+
| 4 | 1.81 |
+--------+------+
DROP TABLE vectors_join_left;
Affected Rows: 0
DROP TABLE vectors_join_right;
Affected Rows: 0
-- ============================================
-- Part 2: Subquery should be a barrier
-- ============================================
CREATE TABLE vectors_subquery (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_subquery VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
-- The inner filter keeps only ids 3 (distance 2.0) and 4 (distance 1.81), so the expected
-- top-1 row is id 4 even though ids 1 and 2 are nearer to [1.0, 0.0] in the full table.
-- NOTE(review): this subquery has the same shape as the one Part 4 calls inlineable (plain
-- filter, no LIMIT/DISTINCT/aggregation), so "barrier" may not describe how it is planned;
-- what this case pins down is that the inner filter is honored. Confirm the intended wording.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist
FROM (
SELECT * FROM vectors_subquery WHERE vec_id >= 3
) s
ORDER BY dist, s.vec_id
LIMIT 1;
+--------+------+
| vec_id | dist |
+--------+------+
| 4 | 1.81 |
+--------+------+
DROP TABLE vectors_subquery;
Affected Rows: 0
-- ============================================
-- Part 3: LEFT JOIN should not pre-limit
-- ============================================
CREATE TABLE vectors_left_join (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
CREATE TABLE vectors_left_join_filter (
vec_id INT,
keep BOOLEAN,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_left_join VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
-- Only vec_id 3,4 have matching rows in filter table
INSERT INTO vectors_left_join_filter VALUES
(3, true, '2024-01-01 00:00:02'),
(4, true, '2024-01-01 00:00:03');
Affected Rows: 2
-- LEFT JOIN then filter by IS NOT NULL
-- Should return vec_id=4 (dist=1.81), not vec_id=1 or 2
-- The IS NOT NULL predicate on r.vec_id discards left rows without a match, leaving ids 3
-- (distance 2.0) and 4 (distance 1.81); ids 1 and 2 are nearer but are filtered out after the join.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist
FROM vectors_left_join l
LEFT JOIN vectors_left_join_filter r ON l.vec_id = r.vec_id
WHERE r.vec_id IS NOT NULL
ORDER BY dist, l.vec_id
LIMIT 1;
+--------+------+
| vec_id | dist |
+--------+------+
| 4 | 1.81 |
+--------+------+
DROP TABLE vectors_left_join;
Affected Rows: 0
DROP TABLE vectors_left_join_filter;
Affected Rows: 0
-- ============================================
-- Part 4: Inlineable subquery should allow hint
-- ============================================
CREATE TABLE vectors_inline_subquery (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_inline_subquery VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
-- Flush before querying; presumably so the rows sit in flushed files that the vector index
-- covers - TODO confirm.
ADMIN FLUSH_TABLE('vectors_inline_subquery');
+----------------------------------------------+
| ADMIN FLUSH_TABLE('vectors_inline_subquery') |
+----------------------------------------------+
| 0 |
+----------------------------------------------+
-- Subquery without LIMIT/DISTINCT/aggregation can be inlined
-- Vector hint should be able to push down
-- vec_id >= 1 keeps all four rows, so the two nearest to [1.0, 0.0] are id 1 (distance 0.0)
-- and id 2 (distance 0.01); the result must be the same whether or not the hint is pushed down.
SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist
FROM (
SELECT * FROM vectors_inline_subquery WHERE vec_id >= 1
) s
ORDER BY dist, s.vec_id
LIMIT 2;
+--------+------+
| vec_id | dist |
+--------+------+
| 1 | 0.0 |
| 2 | 0.01 |
+--------+------+
DROP TABLE vectors_inline_subquery;
Affected Rows: 0
-- ============================================
-- Part 5: CTE should be a barrier
-- ============================================
CREATE TABLE vectors_cte (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
Affected Rows: 0
INSERT INTO vectors_cte VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
Affected Rows: 4
-- CTE acts as optimization barrier
-- Filter in CTE limits to vec_id >= 3, so result should be vec_id=4 (dist=1.81)
-- Of the rows the CTE keeps, id 3 is at distance 2.0 and id 4 at 1.81; ids 1 and 2 are nearer
-- to [1.0, 0.0] overall but must not appear because the CTE filter excludes them.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
WITH filtered AS (
SELECT * FROM vectors_cte WHERE vec_id >= 3
)
SELECT vec_id, round(vec_l2sq_distance(embedding, '[1.0, 0.0]'), 2) AS dist
FROM filtered
ORDER BY dist, vec_id
LIMIT 1;
+--------+------+
| vec_id | dist |
+--------+------+
| 4 | 1.81 |
+--------+------+
DROP TABLE vectors_cte;
Affected Rows: 0

View File

@@ -0,0 +1,159 @@
-- Vector index join/subquery coverage
-- ============================================
-- Part 1: Join should not pre-limit left table
-- ============================================
CREATE TABLE vectors_join_left (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
CREATE TABLE vectors_join_right (
vec_id INT,
note STRING,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_join_left VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
INSERT INTO vectors_join_right VALUES
(3, 'keep', '2024-01-01 00:00:02'),
(4, 'keep', '2024-01-01 00:00:03');
-- Squared L2 distances to [1.0, 0.0]: id 1 -> 0.0, id 2 -> 0.01, id 3 -> 2.0, id 4 -> 1.81.
-- Only ids 3 and 4 have a partner in vectors_join_right, so the nearest joined row is id 4.
-- The globally nearest rows (ids 1, 2) do not survive the join: if the left scan kept only
-- the top-1 nearest row before joining, the result would be empty instead of id 4.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist
FROM vectors_join_left l
JOIN vectors_join_right r ON l.vec_id = r.vec_id
ORDER BY dist, l.vec_id
LIMIT 1;
DROP TABLE vectors_join_left;
DROP TABLE vectors_join_right;
-- ============================================
-- Part 2: Subquery should be a barrier
-- ============================================
CREATE TABLE vectors_subquery (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_subquery VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
-- The inner filter keeps only ids 3 (distance 2.0) and 4 (distance 1.81), so the expected
-- top-1 row is id 4 even though ids 1 and 2 are nearer to [1.0, 0.0] in the full table.
-- NOTE(review): this subquery has the same shape as the one Part 4 calls inlineable (plain
-- filter, no LIMIT/DISTINCT/aggregation), so "barrier" may not describe how it is planned;
-- what this case pins down is that the inner filter is honored. Confirm the intended wording.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist
FROM (
SELECT * FROM vectors_subquery WHERE vec_id >= 3
) s
ORDER BY dist, s.vec_id
LIMIT 1;
DROP TABLE vectors_subquery;
-- ============================================
-- Part 3: LEFT JOIN should not pre-limit
-- ============================================
CREATE TABLE vectors_left_join (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
CREATE TABLE vectors_left_join_filter (
vec_id INT,
keep BOOLEAN,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_left_join VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
-- Only vec_id 3,4 have matching rows in filter table
INSERT INTO vectors_left_join_filter VALUES
(3, true, '2024-01-01 00:00:02'),
(4, true, '2024-01-01 00:00:03');
-- LEFT JOIN then filter by IS NOT NULL
-- Should return vec_id=4 (dist=1.81), not vec_id=1 or 2
-- The IS NOT NULL predicate on r.vec_id discards left rows without a match, leaving ids 3
-- (distance 2.0) and 4 (distance 1.81); ids 1 and 2 are nearer but are filtered out after the join.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
SELECT l.vec_id, round(vec_l2sq_distance(l.embedding, '[1.0, 0.0]'), 2) AS dist
FROM vectors_left_join l
LEFT JOIN vectors_left_join_filter r ON l.vec_id = r.vec_id
WHERE r.vec_id IS NOT NULL
ORDER BY dist, l.vec_id
LIMIT 1;
DROP TABLE vectors_left_join;
DROP TABLE vectors_left_join_filter;
-- ============================================
-- Part 4: Inlineable subquery should allow hint
-- ============================================
CREATE TABLE vectors_inline_subquery (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_inline_subquery VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
-- Flush before querying; presumably so the rows sit in flushed files that the vector index
-- covers - TODO confirm.
ADMIN FLUSH_TABLE('vectors_inline_subquery');
-- Subquery without LIMIT/DISTINCT/aggregation can be inlined
-- Vector hint should be able to push down
-- vec_id >= 1 keeps all four rows, so the two nearest to [1.0, 0.0] are id 1 (distance 0.0)
-- and id 2 (distance 0.01); the result must be the same whether or not the hint is pushed down.
SELECT s.vec_id, round(vec_l2sq_distance(s.embedding, '[1.0, 0.0]'), 2) AS dist
FROM (
SELECT * FROM vectors_inline_subquery WHERE vec_id >= 1
) s
ORDER BY dist, s.vec_id
LIMIT 2;
DROP TABLE vectors_inline_subquery;
-- ============================================
-- Part 5: CTE should be a barrier
-- ============================================
CREATE TABLE vectors_cte (
vec_id INT,
ts TIMESTAMP TIME INDEX,
embedding VECTOR(2) NOT NULL VECTOR INDEX WITH (metric = 'l2sq'),
PRIMARY KEY (vec_id)
);
INSERT INTO vectors_cte VALUES
(1, '2024-01-01 00:00:00', '[1.0, 0.0]'),
(2, '2024-01-01 00:00:01', '[0.9, 0.0]'),
(3, '2024-01-01 00:00:02', '[0.0, 1.0]'),
(4, '2024-01-01 00:00:03', '[0.0, 0.9]');
-- CTE acts as optimization barrier
-- Filter in CTE limits to vec_id >= 3, so result should be vec_id=4 (dist=1.81)
-- Of the rows the CTE keeps, id 3 is at distance 2.0 and id 4 at 1.81; ids 1 and 2 are nearer
-- to [1.0, 0.0] overall but must not appear because the CTE filter excludes them.
-- NOTE(review): no ADMIN FLUSH_TABLE is issued before this query (Part 4 does flush); confirm
-- the vector index path is actually exercised for rows that have not been flushed.
WITH filtered AS (
SELECT * FROM vectors_cte WHERE vec_id >= 3
)
SELECT vec_id, round(vec_l2sq_distance(embedding, '[1.0, 0.0]'), 2) AS dist
FROM filtered
ORDER BY dist, vec_id
LIMIT 1;
DROP TABLE vectors_cte;

View File

@@ -528,7 +528,7 @@ impl Env {
"--bin",
"greptime",
"--features",
"pg_kvbackend,mysql_kvbackend",
"pg_kvbackend,mysql_kvbackend,vector_index",
])
.output()
.expect("Failed to start GreptimeDB");