// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors

import * as fs from "fs";
import * as path from "path";
import * as tmp from "tmp";

import * as arrow15 from "apache-arrow-15";
import * as arrow16 from "apache-arrow-16";
import * as arrow17 from "apache-arrow-17";
import * as arrow18 from "apache-arrow-18";

import {
  Connection,
  MatchQuery,
  PhraseQuery,
  Table,
  connect,
} from "../lancedb";
import {
  Table as ArrowTable,
  Field,
  FixedSizeList,
  Float32,
  Float64,
  Int32,
  Int64,
  List,
  Schema,
  SchemaLike,
  Type,
  Uint8,
  Utf8,
  makeArrowTable,
} from "../lancedb/arrow";
import * as arrow from "../lancedb/arrow";
import {
  EmbeddingFunction,
  LanceSchema,
  getRegistry,
  register,
} from "../lancedb/embedding";
import { Index } from "../lancedb/indices";
import {
  BooleanQuery,
  Occur,
  Operator,
  instanceOfFullTextQuery,
} from "../lancedb/query";
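
// Some suites below are parameterized over several apache-arrow major
// versions so the bindings are exercised against each supported Arrow
// release; `arrow` is the module for the version under test.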
describe.each([arrow15, arrow16, arrow17, arrow18])(
  "Given a table",
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  (arrow: any) => {
    let tmpDir: tmp.DirResult;
    let table: Table;

    const schema:
      | import("apache-arrow-15").Schema
      | import("apache-arrow-16").Schema
      | import("apache-arrow-17").Schema
      | import("apache-arrow-18").Schema = new arrow.Schema([
      new arrow.Field("id", new arrow.Float64(), true),
    ]);

    beforeEach(async () => {
      tmpDir = tmp.dirSync({ unsafeCleanup: true });
      const conn = await connect(tmpDir.name);
      table = await conn.createEmptyTable("some_table", schema);
    });
    afterEach(() => tmpDir.removeCallback());

    it("should be displayable", async () => {
      expect(table.display()).toMatch(
        /NativeTable\(some_table, uri=.*, read_consistency_interval=None\)/,
      );
      table.close();
      expect(table.display()).toBe("ClosedTable(some_table)");
    });

    it("should let me add data", async () => {
      await table.add([{ id: 1 }, { id: 2 }]);
      await table.add([{ id: 1 }]);
      await expect(table.countRows()).resolves.toBe(3);
    });
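
    // Each `add` call writes a new fragment, so two adds totalling three
    // float64 rows yield two fragments and 3 * 8 = 24 bytes, as the stats
    // below assert.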
it("should show table stats", async () => {
|
|
await table.add([{ id: 1 }, { id: 2 }]);
|
|
await table.add([{ id: 1 }]);
|
|
await expect(table.stats()).resolves.toEqual({
|
|
fragmentStats: {
|
|
lengths: {
|
|
max: 2,
|
|
mean: 1,
|
|
min: 1,
|
|
p25: 1,
|
|
p50: 2,
|
|
p75: 2,
|
|
p99: 2,
|
|
},
|
|
numFragments: 2,
|
|
numSmallFragments: 2,
|
|
},
|
|
numIndices: 0,
|
|
numRows: 3,
|
|
totalBytes: 24,
|
|
});
|
|
});
|
|
|
|
it("should overwrite data if asked", async () => {
|
|
const addRes = await table.add([{ id: 1 }, { id: 2 }]);
|
|
expect(addRes).toHaveProperty("version");
|
|
expect(addRes.version).toBe(2);
|
|
await table.add([{ id: 1 }], { mode: "overwrite" });
|
|
await expect(table.countRows()).resolves.toBe(1);
|
|
});
|
|
|
|
it("should let me close the table", async () => {
|
|
expect(table.isOpen()).toBe(true);
|
|
table.close();
|
|
expect(table.isOpen()).toBe(false);
|
|
expect(table.countRows()).rejects.toThrow("Table some_table is closed");
|
|
});
|
|
|
|
it("should let me update values", async () => {
|
|
await table.add([{ id: 1 }]);
|
|
expect(await table.countRows("id == 1")).toBe(1);
|
|
expect(await table.countRows("id == 7")).toBe(0);
|
|
const updateRes = await table.update({ id: "7" });
|
|
expect(updateRes).toHaveProperty("version");
|
|
expect(updateRes.version).toBe(3);
|
|
expect(updateRes).toHaveProperty("rowsUpdated");
|
|
expect(updateRes.rowsUpdated).toBe(1);
|
|
expect(await table.countRows("id == 1")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
await table.add([{ id: 2 }]);
|
|
// Test Map as input
|
|
await table.update(new Map(Object.entries({ id: "10" })), {
|
|
where: "id % 2 == 0",
|
|
});
|
|
expect(await table.countRows("id == 2")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
expect(await table.countRows("id == 10")).toBe(1);
|
|
});
|
|
|
|
it("should let me update values with `values`", async () => {
|
|
await table.add([{ id: 1 }]);
|
|
expect(await table.countRows("id == 1")).toBe(1);
|
|
expect(await table.countRows("id == 7")).toBe(0);
|
|
await table.update({ values: { id: 7 } });
|
|
expect(await table.countRows("id == 1")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
await table.add([{ id: 2 }]);
|
|
// Test Map as input
|
|
await table.update({
|
|
values: {
|
|
id: "10",
|
|
},
|
|
where: "id % 2 == 0",
|
|
});
|
|
expect(await table.countRows("id == 2")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
expect(await table.countRows("id == 10")).toBe(1);
|
|
});
|
|
|
|
it("should let me update values with `valuesSql`", async () => {
|
|
await table.add([{ id: 1 }]);
|
|
expect(await table.countRows("id == 1")).toBe(1);
|
|
expect(await table.countRows("id == 7")).toBe(0);
|
|
await table.update({
|
|
valuesSql: {
|
|
id: "7",
|
|
},
|
|
});
|
|
expect(await table.countRows("id == 1")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
await table.add([{ id: 2 }]);
|
|
// Test Map as input
|
|
await table.update({
|
|
valuesSql: {
|
|
id: "10",
|
|
},
|
|
where: "id % 2 == 0",
|
|
});
|
|
expect(await table.countRows("id == 2")).toBe(0);
|
|
expect(await table.countRows("id == 7")).toBe(1);
|
|
expect(await table.countRows("id == 10")).toBe(1);
|
|
});
|
|
|
|

    // https://github.com/lancedb/lancedb/issues/1293
    test.each([new arrow.Float16(), new arrow.Float32(), new arrow.Float64()])(
      "can create empty table with non default float type: %s",
      async (floatType) => {
        const db = await connect(tmpDir.name);

        const data = [
          { text: "hello", vector: Array(512).fill(1.0) },
          { text: "hello world", vector: Array(512).fill(1.0) },
        ];
        const f64Schema = new arrow.Schema([
          new arrow.Field("text", new arrow.Utf8(), true),
          new arrow.Field(
            "vector",
            new arrow.FixedSizeList(512, new arrow.Field("item", floatType)),
            true,
          ),
        ]);

        const f64Table = await db.createEmptyTable("f64", f64Schema, {
          mode: "overwrite",
        });
        try {
          await f64Table.add(data);
          const res = await f64Table.query().toArray();
          expect(res.length).toBe(2);
        } catch (e) {
          expect(e).toBeUndefined();
        }
      },
    );
it("should be able to omit nullable fields", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const schema = new arrow.Schema([
|
|
new arrow.Field(
|
|
"vector",
|
|
new arrow.FixedSizeList(
|
|
2,
|
|
new arrow.Field("item", new arrow.Float64()),
|
|
),
|
|
true,
|
|
),
|
|
new arrow.Field("item", new arrow.Utf8(), true),
|
|
new arrow.Field("price", new arrow.Float64(), false),
|
|
]);
|
|
const table = await db.createEmptyTable("test", schema);
|
|
|
|
const data1 = { item: "foo", price: 10.0 };
|
|
await table.add([data1]);
|
|
const data2 = { vector: [3.1, 4.1], price: 2.0 };
|
|
await table.add([data2]);
|
|
const data3 = { vector: [5.9, 26.5], item: "bar", price: 3.0 };
|
|
await table.add([data3]);
|
|
|
|
let res = await table.query().limit(10).toArray();
|
|
const resVector = res.map((r) =>
|
|
r.vector ? Array.from(r.vector) : null,
|
|
);
|
|
expect(resVector).toEqual([null, data2.vector, data3.vector]);
|
|
const resItem = res.map((r) => r.item);
|
|
expect(resItem).toEqual(["foo", null, "bar"]);
|
|
const resPrice = res.map((r) => r.price);
|
|
expect(resPrice).toEqual([10.0, 2.0, 3.0]);
|
|
|
|
const data4 = { item: "foo" };
|
|
// We can't omit a column if it's not nullable
|
|
await expect(table.add([data4])).rejects.toThrow(
|
|
"Append with different schema",
|
|
);
|
|
|
|
// But we can alter columns to make them nullable
|
|
await table.alterColumns([{ path: "price", nullable: true }]);
|
|
await table.add([data4]);
|
|
|
|
res = (await table.query().limit(10).toArray()).map((r) => ({
|
|
...r.toJSON(),
|
|
vector: r.vector ? Array.from(r.vector) : null,
|
|
}));
|
|
// Rust fills missing nullable fields with null
|
|
expect(res).toEqual([
|
|
{ ...data1, vector: null },
|
|
{ ...data2, item: null },
|
|
data3,
|
|
{ ...data4, price: null, vector: null },
|
|
]);
|
|
});
|
|
|
|
it("should be able to insert nullable data for non-nullable fields", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const schema = new arrow.Schema([
|
|
new arrow.Field("x", new arrow.Float64(), false),
|
|
new arrow.Field("id", new arrow.Utf8(), false),
|
|
]);
|
|
const table = await db.createEmptyTable("test", schema);
|
|
|
|
const data1 = { x: 4.1, id: "foo" };
|
|
await table.add([data1]);
|
|
const res = (await table.query().toArray())[0];
|
|
expect(res.x).toEqual(data1.x);
|
|
expect(res.id).toEqual(data1.id);
|
|
|
|
const data2 = { x: null, id: "bar" };
|
|
await expect(table.add([data2])).rejects.toThrow(
|
|
"declared as non-nullable but contains null values",
|
|
);
|
|
|
|
// But we can alter columns to make them nullable
|
|
await table.alterColumns([{ path: "x", nullable: true }]);
|
|
await table.add([data2]);
|
|
|
|
const res2 = await table.query().toArray();
|
|
expect(res2.length).toBe(2);
|
|
expect(res2[0].x).toEqual(data1.x);
|
|
expect(res2[0].id).toEqual(data1.id);
|
|
expect(res2[1].x).toBeNull();
|
|
expect(res2[1].id).toEqual(data2.id);
|
|
});
|
|
|
|
it("should support take queries", async () => {
|
|
await table.add([{ id: 1 }, { id: 2 }, { id: 3 }]);
|
|
const res = await table.takeOffsets([1, 2]).toArrow();
|
|
expect(res.getChild("id")?.toJSON()).toEqual([2, 3]);
|
|
});
|
|
|
|
it("should return the table as an instance of an arrow table", async () => {
|
|
const arrowTbl = await table.toArrow();
|
|
expect(arrowTbl).toBeInstanceOf(ArrowTable);
|
|
});
|
|
|
|
it("should be able to handle missing fields", async () => {
|
|
const schema = new arrow.Schema([
|
|
new arrow.Field("id", new arrow.Int32(), true),
|
|
new arrow.Field("y", new arrow.Int32(), true),
|
|
new arrow.Field("z", new arrow.Int64(), true),
|
|
]);
|
|
const db = await connect(tmpDir.name);
|
|
const table = await db.createEmptyTable("testNull", schema);
|
|
await table.add([{ id: 1, y: 2 }]);
|
|
await table.add([{ id: 2 }]);
|
|
|
|
await table
|
|
.mergeInsert("id")
|
|
.whenNotMatchedInsertAll()
|
|
.execute([
|
|
{ id: 3, z: 3 },
|
|
{ id: 4, z: 5 },
|
|
]);
|
|
|
|
const res = await table.query().toArrow();
|
|
expect(res.getChild("id")?.toJSON()).toEqual([1, 2, 3, 4]);
|
|
expect(res.getChild("y")?.toJSON()).toEqual([2, null, null, null]);
|
|
expect(res.getChild("z")?.toJSON()).toEqual([null, null, 3n, 5n]);
|
|
});
|
|
|
|
it("should handle null vectors at end of data", async () => {
|
|
// https://github.com/lancedb/lancedb/issues/2240
|
|
const data = [{ vector: [1, 2, 3] }, { vector: null }];
|
|
const db = await connect("memory://");
|
|
|
|
const table = await db.createTable("my_table", data);
|
|
expect(await table.countRows()).toEqual(2);
|
|
});
|
|
|
|
it("should allow undefined and omitted nullable vector fields", async () => {
|
|
// Test for the bug: can't pass undefined or omit vector column
|
|
const db = await connect("memory://");
|
|
const schema = new arrow.Schema([
|
|
new arrow.Field("id", new arrow.Int32(), true),
|
|
new arrow.Field(
|
|
"vector",
|
|
new arrow.FixedSizeList(
|
|
32,
|
|
new arrow.Field("item", new arrow.Float32(), true),
|
|
),
|
|
true, // nullable = true
|
|
),
|
|
]);
|
|
const table = await db.createEmptyTable("test_table", schema);
|
|
|
|
// Should not throw error for undefined value
|
|
await table.add([{ id: 0, vector: undefined }]);
|
|
|
|
// Should not throw error for omitted field
|
|
await table.add([{ id: 1 }]);
|
|
|
|
// Should still work for null
|
|
await table.add([{ id: 2, vector: null }]);
|
|
|
|
// Should still work for actual vector
|
|
const testVector = new Array(32).fill(0.5);
|
|
await table.add([{ id: 3, vector: testVector }]);
|
|
expect(await table.countRows()).toEqual(4);
|
|
|
|
const res = await table.query().limit(10).toArray();
|
|
const resVector = res.map((r) =>
|
|
r.vector ? Array.from(r.vector) : null,
|
|
);
|
|
expect(resVector).toEqual([null, null, null, testVector]);
|
|
});
|
|
},
|
|
);
|
|
|
|
describe("merge insert", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let table: Table;
|
|
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const conn = await connect(tmpDir.name);
|
|
|
|
table = await conn.createTable("some_table", [
|
|
{ a: 1, b: "a" },
|
|
{ a: 2, b: "b" },
|
|
{ a: 3, b: "c" },
|
|
]);
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
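
  // mergeInsert("a") behaves like SQL MERGE keyed on column "a": matched
  // rows can be updated, unmatched source rows inserted, and target rows
  // missing from the source optionally deleted.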
test("upsert", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 3, b: "y" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
const mergeInsertRes = await table
|
|
.mergeInsert("a")
|
|
.whenMatchedUpdateAll()
|
|
.whenNotMatchedInsertAll()
|
|
.execute(newData, { timeoutMs: 10_000 });
|
|
expect(mergeInsertRes).toHaveProperty("version");
|
|
expect(mergeInsertRes.version).toBe(2);
|
|
expect(mergeInsertRes.numInsertedRows).toBe(1);
|
|
expect(mergeInsertRes.numUpdatedRows).toBe(2);
|
|
expect(mergeInsertRes.numDeletedRows).toBe(0);
|
|
|
|
const expected = [
|
|
{ a: 1, b: "a" },
|
|
{ a: 2, b: "x" },
|
|
{ a: 3, b: "y" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
|
|
const result = (await table.toArrow()).toArray().sort((a, b) => a.a - b.a);
|
|
|
|
expect(result.map((row) => ({ ...row }))).toEqual(expected);
|
|
});
|
|
test("conditional update", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 3, b: "y" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
const mergeInsertRes = await table
|
|
.mergeInsert("a")
|
|
.whenMatchedUpdateAll({ where: "target.b = 'b'" })
|
|
.execute(newData);
|
|
expect(mergeInsertRes).toHaveProperty("version");
|
|
expect(mergeInsertRes.version).toBe(2);
|
|
|
|
const expected = [
|
|
{ a: 1, b: "a" },
|
|
{ a: 2, b: "x" },
|
|
{ a: 3, b: "c" },
|
|
];
|
|
// round trip to arrow and back to json to avoid comparing arrow objects to js object
|
|
// biome-ignore lint/suspicious/noExplicitAny: test
|
|
let res: any[] = JSON.parse(
|
|
JSON.stringify((await table.toArrow()).toArray()),
|
|
);
|
|
res = res.sort((a, b) => a.a - b.a);
|
|
|
|
expect(res).toEqual(expected);
|
|
});
|
|
|
|
test("insert if not exists", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 3, b: "y" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
await table.mergeInsert("a").whenNotMatchedInsertAll().execute(newData);
|
|
const expected = [
|
|
{ a: 1, b: "a" },
|
|
{ a: 2, b: "b" },
|
|
{ a: 3, b: "c" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
let res: any[] = JSON.parse(
|
|
JSON.stringify((await table.toArrow()).toArray()),
|
|
);
|
|
res = res.sort((a, b) => a.a - b.a);
|
|
expect(res).toEqual(expected);
|
|
});
|
|
test("replace range", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
await table
|
|
.mergeInsert("a")
|
|
.whenMatchedUpdateAll()
|
|
.whenNotMatchedInsertAll()
|
|
.whenNotMatchedBySourceDelete({ where: "a > 2" })
|
|
.execute(newData);
|
|
|
|
const expected = [
|
|
{ a: 1, b: "a" },
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
|
|
let res: any[] = JSON.parse(
|
|
JSON.stringify((await table.toArrow()).toArray()),
|
|
);
|
|
res = res.sort((a, b) => a.a - b.a);
|
|
expect(res).toEqual(expected);
|
|
});
|
|
test("replace range no condition", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
await table
|
|
.mergeInsert("a")
|
|
.whenMatchedUpdateAll()
|
|
.whenNotMatchedInsertAll()
|
|
.whenNotMatchedBySourceDelete()
|
|
.execute(newData);
|
|
|
|
const expected = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
|
|
// biome-ignore lint/suspicious/noExplicitAny: test
|
|
let res: any[] = JSON.parse(
|
|
JSON.stringify((await table.toArrow()).toArray()),
|
|
);
|
|
res = res.sort((a, b) => a.a - b.a);
|
|
expect(res).toEqual(expected);
|
|
});
|
|
|
|
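
  // timeoutMs: 0 makes the merge time out immediately, deterministically
  // exercising the timeout error path.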
test("timeout", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
await expect(
|
|
table
|
|
.mergeInsert("a")
|
|
.whenMatchedUpdateAll()
|
|
.whenNotMatchedInsertAll()
|
|
.execute(newData, { timeoutMs: 0 }),
|
|
).rejects.toThrow("merge insert timed out");
|
|
});
|
|
|
|
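
  // useIndex controls whether merge insert may consult a scalar index on
  // the join key when locating matches; the merged result should be the
  // same either way, only the matching strategy differs.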
test("useIndex", async () => {
|
|
const newData = [
|
|
{ a: 2, b: "x" },
|
|
{ a: 4, b: "z" },
|
|
];
|
|
|
|
// Test with useIndex(true) - should work fine
|
|
const result1 = await table
|
|
.mergeInsert("a")
|
|
.whenNotMatchedInsertAll()
|
|
.useIndex(true)
|
|
.execute(newData);
|
|
|
|
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
|
|
|
|
// Test with useIndex(false) - should also work fine
|
|
const newData2 = [{ a: 5, b: "w" }];
|
|
const result2 = await table
|
|
.mergeInsert("a")
|
|
.whenNotMatchedInsertAll()
|
|
.useIndex(false)
|
|
.execute(newData2);
|
|
|
|
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
|
|
});
|
|
});
|
|
|
|
describe("When creating an index", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
const schema = new Schema([
|
|
new Field("id", new Int32(), true),
|
|
new Field("vec", new FixedSizeList(32, new Field("item", new Float32()))),
|
|
new Field("tags", new List(new Field("item", new Utf8(), true))),
|
|
]);
|
|
let tbl: Table;
|
|
let queryVec: number[];
|
|
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const db = await connect(tmpDir.name);
|
|
const data = makeArrowTable(
|
|
Array(300)
|
|
.fill(1)
|
|
.map((_, i) => ({
|
|
id: i,
|
|
vec: Array(32)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
tags: ["tag1", "tag2", "tag3"],
|
|
})),
|
|
{
|
|
schema,
|
|
},
|
|
);
|
|
queryVec = data.toArray()[5].vec.toJSON();
|
|
tbl = await db.createTable("test", data);
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
it("should create a vector index on vector columns", async () => {
|
|
await tbl.createIndex("vec");
|
|
|
|
// check index directory
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
const indices = await tbl.listIndices();
|
|
expect(indices.length).toBe(1);
|
|
expect(indices[0]).toEqual({
|
|
name: "vec_idx",
|
|
indexType: "IvfPq",
|
|
columns: ["vec"],
|
|
});
|
|
const stats = await tbl.indexStats("vec_idx");
|
|
expect(stats?.loss).toBeDefined();
|
|
|
|
// Search without specifying the column
|
|
let rst = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(queryVec)
|
|
.distanceType("dot")
|
|
.toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// Search using `vectorSearch`
|
|
rst = await tbl.vectorSearch(queryVec).limit(2).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// Search with specifying the column
|
|
const rst2 = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(queryVec)
|
|
.column("vec")
|
|
.toArrow();
|
|
expect(rst2.numRows).toBe(2);
|
|
expect(rst.toString()).toEqual(rst2.toString());
|
|
|
|
// test offset
|
|
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// test nprobes
|
|
rst = await tbl.query().nearestTo(queryVec).limit(2).nprobes(50).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
rst = await tbl
|
|
.query()
|
|
.nearestTo(queryVec)
|
|
.limit(2)
|
|
.minimumNprobes(15)
|
|
.toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
rst = await tbl
|
|
.query()
|
|
.nearestTo(queryVec)
|
|
.limit(2)
|
|
.minimumNprobes(10)
|
|
.maximumNprobes(20)
|
|
.toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
expect(() => tbl.query().nearestTo(queryVec).minimumNprobes(0)).toThrow(
|
|
"Invalid input, minimum_nprobes must be greater than 0",
|
|
);
|
|
expect(() => tbl.query().nearestTo(queryVec).maximumNprobes(5)).toThrow(
|
|
"Invalid input, maximum_nprobes must be greater than or equal to minimum_nprobes",
|
|
);
|
|
|
|
await tbl.dropIndex("vec_idx");
|
|
const indices2 = await tbl.listIndices();
|
|
expect(indices2.length).toBe(0);
|
|
});
|
|
|
|
it("should wait for index readiness", async () => {
|
|
// Create an index and then wait for it to be ready
|
|
await tbl.createIndex("vec");
|
|
const indices = await tbl.listIndices();
|
|
expect(indices.length).toBeGreaterThan(0);
|
|
const idxName = indices[0].name;
|
|
await expect(tbl.waitForIndex([idxName], 5)).resolves.toBeUndefined();
|
|
});
|
|
|
|
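
  // distanceRange(min, max) keeps results with min <= _distance < max;
  // leaving either bound undefined leaves that side of the range open.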
it("should search with distance range", async () => {
|
|
await tbl.createIndex("vec");
|
|
|
|
const rst = await tbl.query().limit(10).nearestTo(queryVec).toArrow();
|
|
const distanceColumn = rst.getChild("_distance");
|
|
let minDist = undefined;
|
|
let maxDist = undefined;
|
|
if (distanceColumn) {
|
|
minDist = distanceColumn.get(0);
|
|
maxDist = distanceColumn.get(9);
|
|
}
|
|
|
|
const rst2 = await tbl
|
|
.query()
|
|
.limit(10)
|
|
.nearestTo(queryVec)
|
|
.distanceRange(minDist, maxDist)
|
|
.toArrow();
|
|
const distanceColumn2 = rst2.getChild("_distance");
|
|
expect(distanceColumn2).toBeDefined();
|
|
if (distanceColumn2) {
|
|
for await (const d of distanceColumn2) {
|
|
expect(d).toBeGreaterThanOrEqual(minDist);
|
|
expect(d).toBeLessThan(maxDist);
|
|
}
|
|
}
|
|
|
|
const rst3 = await tbl
|
|
.query()
|
|
.limit(10)
|
|
.nearestTo(queryVec)
|
|
.distanceRange(maxDist, undefined)
|
|
.toArrow();
|
|
const distanceColumn3 = rst3.getChild("_distance");
|
|
expect(distanceColumn3).toBeDefined();
|
|
if (distanceColumn3) {
|
|
for await (const d of distanceColumn3) {
|
|
expect(d).toBeGreaterThanOrEqual(maxDist);
|
|
}
|
|
}
|
|
|
|
const rst4 = await tbl
|
|
.query()
|
|
.limit(10)
|
|
.nearestTo(queryVec)
|
|
.distanceRange(undefined, minDist)
|
|
.toArrow();
|
|
const distanceColumn4 = rst4.getChild("_distance");
|
|
expect(distanceColumn4).toBeDefined();
|
|
if (distanceColumn4) {
|
|
for await (const d of distanceColumn4) {
|
|
expect(d).toBeLessThan(minDist);
|
|
}
|
|
}
|
|
});
|
|
|
|
it("should create and search IVF_HNSW indices", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.hnswSq(),
|
|
});
|
|
|
|
// check index directory
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
const indices = await tbl.listIndices();
|
|
expect(indices.length).toBe(1);
|
|
expect(indices[0]).toEqual({
|
|
name: "vec_idx",
|
|
indexType: "IvfHnswSq",
|
|
columns: ["vec"],
|
|
});
|
|
|
|
// Search without specifying the column
|
|
let rst = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(queryVec)
|
|
.distanceType("dot")
|
|
.toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// Search using `vectorSearch`
|
|
rst = await tbl.vectorSearch(queryVec).limit(2).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// Search with specifying the column
|
|
const rst2 = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(queryVec)
|
|
.column("vec")
|
|
.toArrow();
|
|
expect(rst2.numRows).toBe(2);
|
|
expect(rst.toString()).toEqual(rst2.toString());
|
|
|
|
// test offset
|
|
rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// test ef
|
|
rst = await tbl.query().limit(2).nearestTo(queryVec).ef(100).toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
});
|
|
|
|
it("should be able to query unindexed data", async () => {
|
|
await tbl.createIndex("vec");
|
|
await tbl.add([
|
|
{
|
|
id: 300,
|
|
vec: Array(32)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
tags: [],
|
|
},
|
|
]);
|
|
|
|
const plan1 = await tbl.query().nearestTo(queryVec).explainPlan(true);
|
|
expect(plan1).toMatch("LanceScan");
|
|
|
|
const plan2 = await tbl
|
|
.query()
|
|
.nearestTo(queryVec)
|
|
.fastSearch()
|
|
.explainPlan(true);
|
|
expect(plan2).not.toMatch("LanceScan");
|
|
});
|
|
|
|
it("should be able to run analyze plan", async () => {
|
|
await tbl.createIndex("vec");
|
|
await tbl.add([
|
|
{
|
|
id: 300,
|
|
vec: Array(32)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
tags: [],
|
|
},
|
|
]);
|
|
|
|
const plan = await tbl.query().nearestTo(queryVec).analyzePlan();
|
|
expect(plan).toMatch("AnalyzeExec");
|
|
expect(plan).toMatch("metrics=");
|
|
});
|
|
|
|
it("should be able to query with row id", async () => {
|
|
const results = await tbl
|
|
.query()
|
|
.nearestTo(queryVec)
|
|
.withRowId()
|
|
.limit(1)
|
|
.toArray();
|
|
expect(results.length).toBe(1);
|
|
expect(results[0]).toHaveProperty("_rowid");
|
|
});
|
|
|
|
it("should allow parameters to be specified", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfPq({
|
|
numPartitions: 10,
|
|
}),
|
|
});
|
|
|
|
// TODO: Verify parameters when we can load index config as part of list indices
|
|
});
|
|
|
|
it("should be able to create 4bit IVF_PQ", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfPq({
|
|
numPartitions: 10,
|
|
numBits: 4,
|
|
}),
|
|
});
|
|
});
|
|
|
|
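
  // IVF_RQ pairs IVF partitioning with RabitQ quantization; numBits: 1
  // below selects the most compact encoding.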
it("should be able to create IVF_RQ", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfRq({
|
|
numPartitions: 10,
|
|
numBits: 1,
|
|
}),
|
|
});
|
|
});
|
|
|
|
it("should allow me to replace (or not) an existing index", async () => {
|
|
await tbl.createIndex("id");
|
|
// Default is replace=true
|
|
await tbl.createIndex("id");
|
|
await expect(tbl.createIndex("id", { replace: false })).rejects.toThrow(
|
|
"already exists",
|
|
);
|
|
await tbl.createIndex("id", { replace: true });
|
|
});
|
|
|
|
test("should create a scalar index on scalar columns", async () => {
|
|
await tbl.createIndex("id");
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
|
|
for await (const r of tbl.query().where("id > 1").select(["id"])) {
|
|
expect(r.numRows).toBe(298);
|
|
}
|
|
// should also work with 'filter' alias
|
|
for await (const r of tbl.query().filter("id > 1").select(["id"])) {
|
|
expect(r.numRows).toBe(298);
|
|
}
|
|
});
|
|
|
|
test("create a bitmap index", async () => {
|
|
await tbl.createIndex("id", {
|
|
config: Index.bitmap(),
|
|
});
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
});
|
|
|
|
test("create a hnswPq index", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.hnswPq({
|
|
numPartitions: 10,
|
|
}),
|
|
});
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
});
|
|
|
|
test("create a HnswSq index", async () => {
|
|
await tbl.createIndex("vec", {
|
|
config: Index.hnswSq({
|
|
numPartitions: 10,
|
|
}),
|
|
});
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
});
|
|
|
|
test("create a label list index", async () => {
|
|
await tbl.createIndex("tags", {
|
|
config: Index.labelList(),
|
|
});
|
|
const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
|
|
expect(fs.readdirSync(indexDir)).toHaveLength(1);
|
|
});
|
|
|
|
test("should be able to get index stats", async () => {
|
|
await tbl.createIndex("id");
|
|
|
|
const stats = await tbl.indexStats("id_idx");
|
|
expect(stats).toBeDefined();
|
|
expect(stats?.numIndexedRows).toEqual(300);
|
|
expect(stats?.numUnindexedRows).toEqual(0);
|
|
expect(stats?.distanceType).toBeUndefined();
|
|
expect(stats?.indexType).toEqual("BTREE");
|
|
expect(stats?.numIndices).toEqual(1);
|
|
expect(stats?.loss).toBeUndefined();
|
|
});
|
|
|
|
test("when getting stats on non-existent index", async () => {
|
|
const stats = await tbl.indexStats("some non-existent index");
|
|
expect(stats).toBeUndefined();
|
|
});
|
|
|
|
test("should support name and train parameters", async () => {
|
|
// Test with custom name
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfPq({ numPartitions: 4 }),
|
|
name: "my_custom_vector_index",
|
|
});
|
|
|
|
const indices = await tbl.listIndices();
|
|
expect(indices).toHaveLength(1);
|
|
expect(indices[0].name).toBe("my_custom_vector_index");
|
|
|
|
// Test scalar index with train=false
|
|
await tbl.createIndex("id", {
|
|
config: Index.btree(),
|
|
name: "btree_empty",
|
|
train: false,
|
|
});
|
|
|
|
const allIndices = await tbl.listIndices();
|
|
expect(allIndices).toHaveLength(2);
|
|
expect(allIndices.some((idx) => idx.name === "btree_empty")).toBe(true);
|
|
|
|
// Test with both name and train=true (use tags column)
|
|
await tbl.createIndex("tags", {
|
|
config: Index.labelList(),
|
|
name: "tags_trained",
|
|
train: true,
|
|
});
|
|
|
|
const finalIndices = await tbl.listIndices();
|
|
expect(finalIndices).toHaveLength(3);
|
|
expect(finalIndices.some((idx) => idx.name === "tags_trained")).toBe(true);
|
|
});
|
|
|
|
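
  // Hamming distance is defined over binary vectors, so this fixture stores
  // each vector as a FixedSizeList of 32 random bytes (Uint8).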
test("create ivf_flat with binary vectors", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const binarySchema = new Schema([
|
|
new Field("id", new Int32(), true),
|
|
new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
|
|
]);
|
|
const tbl = await db.createTable(
|
|
"binary",
|
|
makeArrowTable(
|
|
Array(300)
|
|
.fill(1)
|
|
.map((_, i) => ({
|
|
id: i,
|
|
vec: Array(32)
|
|
.fill(1)
|
|
.map(() => Math.floor(Math.random() * 255)),
|
|
})),
|
|
{ schema: binarySchema },
|
|
),
|
|
);
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfFlat({ numPartitions: 10, distanceType: "hamming" }),
|
|
});
|
|
|
|
// query with binary vectors
|
|
const queryVec = Array(32)
|
|
.fill(1)
|
|
.map(() => Math.floor(Math.random() * 255));
|
|
const rst = await tbl.query().limit(5).nearestTo(queryVec).toArrow();
|
|
expect(rst.numRows).toBe(5);
|
|
});
|
|
|
|
// TODO: Move this test to the query API test (making sure we can reject queries
|
|
// when the dimension is incorrect)
|
|
test("two columns with different dimensions", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const schema = new Schema([
|
|
new Field("id", new Int32(), true),
|
|
new Field("vec", new FixedSizeList(32, new Field("item", new Float32()))),
|
|
new Field(
|
|
"vec2",
|
|
new FixedSizeList(64, new Field("item", new Float32())),
|
|
),
|
|
]);
|
|
const tbl = await db.createTable(
|
|
"two_vectors",
|
|
makeArrowTable(
|
|
Array(300)
|
|
.fill(1)
|
|
.map((_, i) => ({
|
|
id: i,
|
|
vec: Array(32)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
vec2: Array(64) // different dimension
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
})),
|
|
{ schema },
|
|
),
|
|
);
|
|
|
|
// Only build index over v1
|
|
await tbl.createIndex("vec", {
|
|
config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }),
|
|
waitTimeoutSeconds: 30,
|
|
});
|
|
|
|
const rst = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(
|
|
Array(32)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
)
|
|
.toArrow();
|
|
expect(rst.numRows).toBe(2);
|
|
|
|
// Search with specifying the column
|
|
await expect(
|
|
tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(
|
|
Array(64)
|
|
.fill(1)
|
|
.map(() => Math.random()),
|
|
)
|
|
.column("vec")
|
|
.toArrow(),
|
|
).rejects.toThrow(
|
|
/.* query dim\(64\) doesn't match the column vec vector dim\(32\).*/,
|
|
);
|
|
|
|
const query64 = Array(64)
|
|
.fill(1)
|
|
.map(() => Math.random());
|
|
const rst64Query = await tbl.query().limit(2).nearestTo(query64).toArrow();
|
|
const rst64Search = await tbl
|
|
.query()
|
|
.limit(2)
|
|
.nearestTo(query64)
|
|
.column("vec2")
|
|
.toArrow();
|
|
expect(rst64Query.toString()).toEqual(rst64Search.toString());
|
|
expect(rst64Query.numRows).toBe(2);
|
|
});
|
|
});
|
|
|
|
describe("When querying a table", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
it("should throw an error when timeout is reached", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = makeArrowTable([
|
|
{ text: "a", vector: [0.1, 0.2] },
|
|
{ text: "b", vector: [0.3, 0.4] },
|
|
]);
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", { config: Index.fts() });
|
|
|
|
await expect(
|
|
table.query().where("text != 'a'").toArray({ timeoutMs: 0 }),
|
|
).rejects.toThrow("Query timeout");
|
|
|
|
await expect(
|
|
table.query().nearestTo([0.0, 0.0]).toArrow({ timeoutMs: 0 }),
|
|
).rejects.toThrow("Query timeout");
|
|
|
|
await expect(
|
|
table.search("a", "fts").toArray({ timeoutMs: 0 }),
|
|
).rejects.toThrow("Query timeout");
|
|
|
|
await expect(
|
|
table
|
|
.query()
|
|
.nearestToText("a")
|
|
.nearestTo([0.0, 0.0])
|
|
.toArrow({ timeoutMs: 0 }),
|
|
).rejects.toThrow("Query timeout");
|
|
});
|
|
});
|
|
|
|
describe("Read consistency interval", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
// const intervals = [undefined, 0, 0.1];
|
|
const intervals = [0];
|
|
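
  // readConsistencyInterval semantics: undefined never auto-refreshes, 0
  // checks for a newer version on every read, and a positive value refreshes
  // at most once per interval. Only 0 is currently enabled; the branches
  // below cover the other cases should they be re-enabled.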
test.each(intervals)("read consistency interval %p", async (interval) => {
|
|
const db = await connect(tmpDir.name);
|
|
const table = await db.createTable("my_table", [{ id: 1 }]);
|
|
|
|
const db2 = await connect(tmpDir.name, {
|
|
readConsistencyInterval: interval,
|
|
});
|
|
const table2 = await db2.openTable("my_table");
|
|
expect(await table2.countRows()).toEqual(await table.countRows());
|
|
|
|
await table.add([{ id: 2 }]);
|
|
|
|
if (interval === undefined) {
|
|
expect(await table2.countRows()).toEqual(1);
|
|
// TODO: once we implement time travel we can uncomment this part of the test.
|
|
// await table2.checkout_latest();
|
|
// expect(await table2.countRows()).toEqual(2);
|
|
} else if (interval === 0) {
|
|
expect(await table2.countRows()).toEqual(2);
|
|
} else {
|
|
// interval == 0.1
|
|
expect(await table2.countRows()).toEqual(1);
|
|
await new Promise((r) => setTimeout(r, 100));
|
|
expect(await table2.countRows()).toEqual(2);
|
|
}
|
|
});
|
|
});
|
|
|
|
describe("schema evolution", function () {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
// Create a new sample table
|
|
it("can add a new column to the schema", async function () {
|
|
const con = await connect(tmpDir.name);
|
|
const table = await con.createTable("vectors", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
|
|
await table.addColumns([
|
|
{ name: "price", valueSql: "cast(10.0 as float)" },
|
|
]);
|
|
|
|
const expectedSchema = new Schema([
|
|
new Field("id", new Int64(), true),
|
|
new Field(
|
|
"vector",
|
|
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
|
true,
|
|
),
|
|
new Field("price", new Float32(), false),
|
|
]);
|
|
expect(await table.schema()).toEqual(expectedSchema);
|
|
});
|
|
|
|
it("can alter the columns in the schema", async function () {
|
|
const con = await connect(tmpDir.name);
|
|
const schema = new Schema([
|
|
new Field("id", new Int64(), true),
|
|
new Field(
|
|
"vector",
|
|
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
|
true,
|
|
),
|
|
new Field("price", new Float64(), false),
|
|
]);
|
|
const table = await con.createTable("vectors", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
// Can create a non-nullable column only through addColumns at the moment.
|
|
const addColumnsRes = await table.addColumns([
|
|
{ name: "price", valueSql: "cast(10.0 as double)" },
|
|
]);
|
|
expect(addColumnsRes).toHaveProperty("version");
|
|
expect(addColumnsRes.version).toBe(2);
|
|
expect(await table.schema()).toEqual(schema);
|
|
|
|
const alterColumnsRes = await table.alterColumns([
|
|
{ path: "id", rename: "new_id" },
|
|
{ path: "price", nullable: true },
|
|
]);
|
|
expect(alterColumnsRes).toHaveProperty("version");
|
|
expect(alterColumnsRes.version).toBe(3);
|
|
|
|
const expectedSchema = new Schema([
|
|
new Field("new_id", new Int64(), true),
|
|
new Field(
|
|
"vector",
|
|
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
|
true,
|
|
),
|
|
new Field("price", new Float64(), true),
|
|
]);
|
|
expect(await table.schema()).toEqual(expectedSchema);
|
|
|
|
await table.alterColumns([{ path: "new_id", dataType: "int32" }]);
|
|
const expectedSchema2 = new Schema([
|
|
new Field("new_id", new Int32(), true),
|
|
new Field(
|
|
"vector",
|
|
new FixedSizeList(2, new Field("item", new Float32(), true)),
|
|
true,
|
|
),
|
|
new Field("price", new Float64(), true),
|
|
]);
|
|
expect(await table.schema()).toEqual(expectedSchema2);
|
|
|
|
await table.alterColumns([
|
|
{
|
|
path: "vector",
|
|
dataType: new FixedSizeList(2, new Field("item", new Float64(), true)),
|
|
},
|
|
]);
|
|
const expectedSchema3 = new Schema([
|
|
new Field("new_id", new Int32(), true),
|
|
new Field(
|
|
"vector",
|
|
new FixedSizeList(2, new Field("item", new Float64(), true)),
|
|
true,
|
|
),
|
|
new Field("price", new Float64(), true),
|
|
]);
|
|
expect(await table.schema()).toEqual(expectedSchema3);
|
|
});
|
|
|
|
it("can cast to various types", async function () {
|
|
const con = await connect(tmpDir.name);
|
|
|
|
// integers
|
|
const intTypes = [
|
|
new arrow.Int8(),
|
|
new arrow.Int16(),
|
|
new arrow.Int32(),
|
|
new arrow.Int64(),
|
|
new arrow.Uint8(),
|
|
new arrow.Uint16(),
|
|
new arrow.Uint32(),
|
|
new arrow.Uint64(),
|
|
];
|
|
const tableInts = await con.createTable("ints", [{ id: 1n }], {
|
|
schema: new Schema([new Field("id", new Int64(), true)]),
|
|
});
|
|
for (const intType of intTypes) {
|
|
await tableInts.alterColumns([{ path: "id", dataType: intType }]);
|
|
const schema = new Schema([new Field("id", intType, true)]);
|
|
expect(await tableInts.schema()).toEqual(schema);
|
|
}
|
|
|
|
// floats
|
|
const floatTypes = [
|
|
new arrow.Float16(),
|
|
new arrow.Float32(),
|
|
new arrow.Float64(),
|
|
];
|
|
const tableFloats = await con.createTable("floats", [{ val: 2.1 }], {
|
|
schema: new Schema([new Field("val", new Float32(), true)]),
|
|
});
|
|
for (const floatType of floatTypes) {
|
|
await tableFloats.alterColumns([{ path: "val", dataType: floatType }]);
|
|
const schema = new Schema([new Field("val", floatType, true)]);
|
|
expect(await tableFloats.schema()).toEqual(schema);
|
|
}
|
|
|
|
// Lists of floats
|
|
const listTypes = [
|
|
new arrow.List(new arrow.Field("item", new arrow.Float32(), true)),
|
|
new arrow.FixedSizeList(
|
|
2,
|
|
new arrow.Field("item", new arrow.Float64(), true),
|
|
),
|
|
new arrow.FixedSizeList(
|
|
2,
|
|
new arrow.Field("item", new arrow.Float16(), true),
|
|
),
|
|
new arrow.FixedSizeList(
|
|
2,
|
|
new arrow.Field("item", new arrow.Float32(), true),
|
|
),
|
|
];
|
|
const tableLists = await con.createTable("lists", [{ val: [2.1, 3.2] }], {
|
|
schema: new Schema([
|
|
new Field(
|
|
"val",
|
|
new FixedSizeList(2, new arrow.Field("item", new Float32())),
|
|
true,
|
|
),
|
|
]),
|
|
});
|
|
for (const listType of listTypes) {
|
|
await tableLists.alterColumns([{ path: "val", dataType: listType }]);
|
|
const schema = new Schema([new Field("val", listType, true)]);
|
|
expect(await tableLists.schema()).toEqual(schema);
|
|
}
|
|
});
|
|
|
|
it("can drop a column from the schema", async function () {
|
|
const con = await connect(tmpDir.name);
|
|
const table = await con.createTable("vectors", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
const dropColumnsRes = await table.dropColumns(["vector"]);
|
|
expect(dropColumnsRes).toHaveProperty("version");
|
|
expect(dropColumnsRes.version).toBe(2);
|
|
|
|
const expectedSchema = new Schema([new Field("id", new Int64(), true)]);
|
|
expect(await table.schema()).toEqual(expectedSchema);
|
|
});
|
|
});
|
|
|
|
describe("when dealing with versioning", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
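
  // checkout(version) pins the table to a read-only historical snapshot,
  // checkoutLatest() returns to the live head, and restore() promotes the
  // checked-out snapshot to a new head version.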
it("can travel in time", async () => {
|
|
// Setup
|
|
const con = await connect(tmpDir.name);
|
|
const table = await con.createTable("vectors", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
const version = await table.version();
|
|
await table.add([{ id: 2n, vector: [0.1, 0.2] }]);
|
|
expect(await table.countRows()).toBe(2);
|
|
// Make sure we can rewind
|
|
await table.checkout(version);
|
|
expect(await table.countRows()).toBe(1);
|
|
// Can't add data in time travel mode
|
|
await expect(table.add([{ id: 3n, vector: [0.1, 0.2] }])).rejects.toThrow(
|
|
"table cannot be modified when a specific version is checked out",
|
|
);
|
|
// Can go back to normal mode
|
|
await table.checkoutLatest();
|
|
expect(await table.countRows()).toBe(2);
|
|
// Should be able to add data again
|
|
await table.add([{ id: 2n, vector: [0.1, 0.2] }]);
|
|
expect(await table.countRows()).toBe(3);
|
|
// Now checkout and restore
|
|
await table.checkout(version);
|
|
await table.restore();
|
|
expect(await table.countRows()).toBe(1);
|
|
// Should be able to add data
|
|
await table.add([{ id: 2n, vector: [0.1, 0.2] }]);
|
|
expect(await table.countRows()).toBe(2);
|
|
// Can't use restore if not checked out
|
|
await expect(table.restore()).rejects.toThrow(
|
|
"checkout before running restore",
|
|
);
|
|
});
|
|
});
|
|
|
|
describe("when dealing with tags", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
it("can manage tags", async () => {
|
|
const conn = await connect(tmpDir.name, {
|
|
readConsistencyInterval: 0,
|
|
});
|
|
|
|
const table = await conn.createTable("my_table", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
expect(await table.version()).toBe(1);
|
|
|
|
await table.add([{ id: 2n, vector: [0.3, 0.4] }]);
|
|
expect(await table.version()).toBe(2);
|
|
|
|
const tagsManager = await table.tags();
|
|
|
|
const initialTags = await tagsManager.list();
|
|
expect(Object.keys(initialTags).length).toBe(0);
|
|
|
|
const tag1 = "tag1";
|
|
await tagsManager.create(tag1, 1);
|
|
expect(await tagsManager.getVersion(tag1)).toBe(1);
|
|
|
|
const tagsAfterFirst = await tagsManager.list();
|
|
expect(Object.keys(tagsAfterFirst).length).toBe(1);
|
|
expect(tagsAfterFirst).toHaveProperty(tag1);
|
|
expect(tagsAfterFirst[tag1].version).toBe(1);
|
|
|
|
await tagsManager.create("tag2", 2);
|
|
expect(await tagsManager.getVersion("tag2")).toBe(2);
|
|
|
|
const tagsAfterSecond = await tagsManager.list();
|
|
expect(Object.keys(tagsAfterSecond).length).toBe(2);
|
|
expect(tagsAfterSecond).toHaveProperty(tag1);
|
|
expect(tagsAfterSecond[tag1].version).toBe(1);
|
|
expect(tagsAfterSecond).toHaveProperty("tag2");
|
|
expect(tagsAfterSecond["tag2"].version).toBe(2);
|
|
|
|
await table.add([{ id: 3n, vector: [0.5, 0.6] }]);
|
|
await tagsManager.update(tag1, 3);
|
|
expect(await tagsManager.getVersion(tag1)).toBe(3);
|
|
|
|
await tagsManager.delete("tag2");
|
|
const tagsAfterDelete = await tagsManager.list();
|
|
expect(Object.keys(tagsAfterDelete).length).toBe(1);
|
|
expect(tagsAfterDelete).toHaveProperty(tag1);
|
|
expect(tagsAfterDelete[tag1].version).toBe(3);
|
|
|
|
await table.add([{ id: 4n, vector: [0.7, 0.8] }]);
|
|
expect(await table.version()).toBe(4);
|
|
|
|
await table.checkout(tag1);
|
|
expect(await table.version()).toBe(3);
|
|
|
|
await table.checkoutLatest();
|
|
expect(await table.version()).toBe(4);
|
|
});
|
|
|
|
it("can checkout and restore tags", async () => {
|
|
const conn = await connect(tmpDir.name, {
|
|
readConsistencyInterval: 0,
|
|
});
|
|
|
|
const table = await conn.createTable("my_table", [
|
|
{ id: 1n, vector: [0.1, 0.2] },
|
|
]);
|
|
expect(await table.version()).toBe(1);
|
|
expect(await table.countRows()).toBe(1);
|
|
const tagsManager = await table.tags();
|
|
const tag1 = "tag1";
|
|
await tagsManager.create(tag1, 1);
|
|
await table.add([{ id: 2n, vector: [0.3, 0.4] }]);
|
|
const tag2 = "tag2";
|
|
await tagsManager.create(tag2, 2);
|
|
expect(await table.version()).toBe(2);
|
|
await table.checkout(tag1);
|
|
expect(await table.version()).toBe(1);
|
|
await table.restore();
|
|
expect(await table.version()).toBe(3);
|
|
expect(await table.countRows()).toBe(1);
|
|
await table.add([{ id: 3n, vector: [0.5, 0.6] }]);
|
|
expect(await table.countRows()).toBe(2);
|
|
});
|
|
});
|
|
|
|
describe("when optimizing a dataset", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let table: Table;
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const con = await connect(tmpDir.name);
|
|
table = await con.createTable("vectors", [{ id: 1 }]);
|
|
await table.add([{ id: 2 }]);
|
|
});
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
it("compacts files", async () => {
|
|
const stats = await table.optimize();
|
|
expect(stats.compaction.filesAdded).toBe(1);
|
|
expect(stats.compaction.filesRemoved).toBe(2);
|
|
expect(stats.compaction.fragmentsAdded).toBe(1);
|
|
expect(stats.compaction.fragmentsRemoved).toBe(2);
|
|
});
|
|
|
|
it("cleanups old versions", async () => {
|
|
const stats = await table.optimize({ cleanupOlderThan: new Date() });
|
|
expect(stats.prune.bytesRemoved).toBeGreaterThan(0);
|
|
expect(stats.prune.oldVersionsRemoved).toBe(3);
|
|
});
|
|
|
|
it("delete unverified", async () => {
|
|
const version = await table.version();
|
|
const versionFile = `${tmpDir.name}/${table.name}.lance/_versions/${
|
|
version - 1
|
|
}.manifest`;
|
|
fs.rmSync(versionFile);
|
|
|
|
let stats = await table.optimize({ deleteUnverified: false });
|
|
expect(stats.prune.oldVersionsRemoved).toBe(0);
|
|
|
|
stats = await table.optimize({
|
|
cleanupOlderThan: new Date(),
|
|
deleteUnverified: true,
|
|
});
|
|
expect(stats.prune.oldVersionsRemoved).toBeGreaterThan(1);
|
|
});
|
|
});
|
|
|
|

describe.each([arrow15, arrow16, arrow17, arrow18])(
  "when searching with embeddings and full text",
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  (arrow: any) => {
    let tmpDir: tmp.DirResult;
    beforeEach(() => {
      getRegistry().reset();
      tmpDir = tmp.dirSync({ unsafeCleanup: true });
    });
    afterEach(() => {
      tmpDir.removeCallback();
    });
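
    // The mock below returns fixed embeddings for known strings so the
    // assertions are deterministic; once registered and referenced by the
    // schema's sourceField/vectorField, createTable embeds the source text
    // automatically and search() embeds query strings the same way.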
test("can search using a string", async () => {
|
|
@register()
|
|
class MockEmbeddingFunction extends EmbeddingFunction<string> {
|
|
ndims() {
|
|
return 1;
|
|
}
|
|
embeddingDataType() {
|
|
return new Float32();
|
|
}
|
|
|
|
// Hardcoded embeddings for the sake of testing
|
|
async computeQueryEmbeddings(_data: string) {
|
|
switch (_data) {
|
|
case "greetings":
|
|
return [0.1];
|
|
case "farewell":
|
|
return [0.2];
|
|
default:
|
|
return null as never;
|
|
}
|
|
}
|
|
|
|
// Hardcoded embeddings for the sake of testing
|
|
async computeSourceEmbeddings(data: string[]) {
|
|
return data.map((s) => {
|
|
switch (s) {
|
|
case "hello world":
|
|
return [0.1];
|
|
case "goodbye world":
|
|
return [0.2];
|
|
default:
|
|
return null as never;
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
const func = new MockEmbeddingFunction();
|
|
const schema = LanceSchema({
|
|
text: func.sourceField(new arrow.Utf8()),
|
|
vector: func.vectorField(),
|
|
});
|
|
const db = await connect(tmpDir.name);
|
|
const data = [{ text: "hello world" }, { text: "goodbye world" }];
|
|
const table = await db.createTable("test", data, { schema });
|
|
|
|
const results = await table.search("greetings").toArray();
|
|
expect(results[0].text).toBe(data[0].text);
|
|
|
|
const results2 = await table.search("farewell").toArray();
|
|
expect(results2[0].text).toBe(data[1].text);
|
|
});
|
|
|
|
test("rejects if no embedding function provided", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
|
|
expect(table.search("hello", "vector").toArray()).rejects.toThrow(
|
|
"No embedding functions are defined in the table",
|
|
);
|
|
});
|
|
|
|
test("full text search if no embedding function provided", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts(),
|
|
});
|
|
|
|
const results = await table.search("hello").toArray();
|
|
expect(results[0].text).toBe(data[0].text);
|
|
|
|
const query = new MatchQuery("goodbye", "text");
|
|
expect(instanceOfFullTextQuery(query)).toBe(true);
|
|
const results2 = await table
|
|
.search(new MatchQuery("goodbye", "text"))
|
|
.toArray();
|
|
expect(results2[0].text).toBe(data[1].text);
|
|
});
|
|
|
|
test("prewarm full text search index", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
|
|
{ text: ["lance database"], vector: [0.4, 0.5, 0.6] },
|
|
{ text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
|
|
{ text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
|
|
{ text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts(),
|
|
});
|
|
|
|
// For the moment, we just confirm we can call prewarmIndex without error
|
|
// and still search it afterwards
|
|
await table.prewarmIndex("text_idx");
|
|
|
|
const results = await table.search("lance").toArray();
|
|
expect(results.length).toBe(3);
|
|
});
|
|
|
|
test("full text index on list", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: ["lance database", "the", "search"], vector: [0.1, 0.2, 0.3] },
|
|
{ text: ["lance database"], vector: [0.4, 0.5, 0.6] },
|
|
{ text: ["lance", "search"], vector: [0.7, 0.8, 0.9] },
|
|
{ text: ["database", "search"], vector: [1.0, 1.1, 1.2] },
|
|
{ text: ["unrelated", "doc"], vector: [1.3, 1.4, 1.5] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({
|
|
withPosition: true,
|
|
}),
|
|
});
|
|
|
|
const results = await table.search("lance").toArray();
|
|
expect(results.length).toBe(3);
|
|
|
|
const results2 = await table.search('"lance database"').toArray();
|
|
expect(results2.length).toBe(2);
|
|
});
|
|
|
|
test("full text search without positions", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({ withPosition: false }),
|
|
});
|
|
|
|
const results = await table.search("hello").toArray();
|
|
expect(results[0].text).toBe(data[0].text);
|
|
|
|
const results2 = await table
|
|
.search(new MatchQuery("hello world", "text"))
|
|
.toArray();
|
|
expect(results2.length).toBe(2);
|
|
|
|
const results3 = await table
|
|
.search(
|
|
new MatchQuery("hello world", "text", { operator: Operator.And }),
|
|
)
|
|
.toArray();
|
|
expect(results3.length).toBe(1);
|
|
});
|
|
|
|
test("full text search without lowercase", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "Hello World", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({ withPosition: false }),
|
|
});
|
|
const results = await table.search("hello").toArray();
|
|
expect(results.length).toBe(2);
|
|
|
|
await table.createIndex("text", {
|
|
config: Index.fts({ withPosition: false, lowercase: false }),
|
|
});
|
|
const results2 = await table.search("hello").toArray();
|
|
expect(results2.length).toBe(1);
|
|
});
|
|
|
|
test("full text search phrase query", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({
|
|
withPosition: true,
|
|
}),
|
|
});
|
|
|
|
const results = await table.search("world").toArray();
|
|
expect(results.length).toBe(2);
|
|
const phraseResults = await table.search('"hello world"').toArray();
|
|
expect(phraseResults.length).toBe(1);
|
|
const phraseResults2 = await table
|
|
.search(new PhraseQuery("hello world", "text"))
|
|
.toArray();
|
|
expect(phraseResults2.length).toBe(1);
|
|
});
|
|
|
|
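
    // fuzziness bounds the edit distance between query and indexed terms;
    // prefixLength requires that many leading characters to match exactly
    // before fuzzy matching is applied.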
test("full text search fuzzy query", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "fa", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "fo", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "fob", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "focus", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "foo", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "food", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "foul", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts(),
|
|
});
|
|
|
|
const results = await table
|
|
.search(new MatchQuery("foo", "text"))
|
|
.toArray();
|
|
expect(results.length).toBe(1);
|
|
expect(results[0].text).toBe("foo");
|
|
|
|
const fuzzyResults = await table
|
|
.search(new MatchQuery("foo", "text", { fuzziness: 1 }))
|
|
.toArray();
|
|
expect(fuzzyResults.length).toBe(4);
|
|
const resultSet = new Set(fuzzyResults.map((r) => r.text));
|
|
expect(resultSet.has("foo")).toBe(true);
|
|
expect(resultSet.has("fob")).toBe(true);
|
|
expect(resultSet.has("fo")).toBe(true);
|
|
expect(resultSet.has("food")).toBe(true);
|
|
|
|
const prefixResults = await table
|
|
.search(
|
|
new MatchQuery("foo", "text", { fuzziness: 3, prefixLength: 3 }),
|
|
)
|
|
.toArray();
|
|
expect(prefixResults.length).toBe(2);
|
|
const resultSet2 = new Set(prefixResults.map((r) => r.text));
|
|
expect(resultSet2.has("foo")).toBe(true);
|
|
expect(resultSet2.has("food")).toBe(true);
|
|
});
|
|
|
|
test("full text search boolean query", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "The cat and dog are playing" },
|
|
{ text: "The cat is sleeping" },
|
|
{ text: "The dog is barking" },
|
|
{ text: "The dog chases the cat" },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({ withPosition: false }),
|
|
});
|
|
|
|
const shouldResults = await table
|
|
.search(
|
|
new BooleanQuery([
|
|
[Occur.Should, new MatchQuery("cat", "text")],
|
|
[Occur.Should, new MatchQuery("dog", "text")],
|
|
]),
|
|
)
|
|
.toArray();
|
|
expect(shouldResults.length).toBe(4);
|
|
|
|
const mustResults = await table
|
|
.search(
|
|
new BooleanQuery([
|
|
[Occur.Must, new MatchQuery("cat", "text")],
|
|
[Occur.Must, new MatchQuery("dog", "text")],
|
|
]),
|
|
)
|
|
.toArray();
|
|
expect(mustResults.length).toBe(2);
|
|
|
|
const mustNotResults = await table
|
|
.search(
|
|
new BooleanQuery([
|
|
[Occur.Must, new MatchQuery("cat", "text")],
|
|
[Occur.MustNot, new MatchQuery("dog", "text")],
|
|
]),
|
|
)
|
|
.toArray();
|
|
expect(mustNotResults.length).toBe(1);
|
|
});
|
|
|
|
test("full text search ngram", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
|
|
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
await table.createIndex("text", {
|
|
config: Index.fts({ baseTokenizer: "ngram" }),
|
|
});
|
|
|
|
const results = await table.search("lan").toArray();
|
|
expect(results.length).toBe(2);
|
|
const resultSet = new Set(results.map((r) => r.text));
|
|
expect(resultSet.has("lance database")).toBe(true);
|
|
expect(resultSet.has("lance is cool")).toBe(true);
|
|
|
|
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
|
|
expect(results2.length).toBe(2);
|
|
const resultSet2 = new Set(results2.map((r) => r.text));
|
|
expect(resultSet2.has("lance database")).toBe(true);
|
|
expect(resultSet2.has("lance is cool")).toBe(true);
|
|
|
|
// the default min_ngram_length is 3, so "la" should not match
|
|
const results3 = await table.search("la").toArray();
|
|
expect(results3.length).toBe(0);
|
|
|
|
// test setting min_ngram_length and prefix_only
|
|
await table.createIndex("text", {
|
|
config: Index.fts({
|
|
baseTokenizer: "ngram",
|
|
ngramMinLength: 2,
|
|
prefixOnly: true,
|
|
}),
|
|
replace: true,
|
|
});
|
|
|
|
const results4 = await table.search("lan").toArray();
|
|
expect(results4.length).toBe(2);
|
|
const resultSet4 = new Set(results4.map((r) => r.text));
|
|
expect(resultSet4.has("lance database")).toBe(true);
|
|
expect(resultSet4.has("lance is cool")).toBe(true);
|
|
|
|
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
|
|
expect(results5.length).toBe(0);
|
|
|
|
const results6 = await table.search("la").toArray();
|
|
expect(results6.length).toBe(2);
|
|
const resultSet6 = new Set(results6.map((r) => r.text));
|
|
expect(resultSet6.has("lance database")).toBe(true);
|
|
expect(resultSet6.has("lance is cool")).toBe(true);
|
|
});
|
|
|
|
test.each([
|
|
[0.4, 0.5, 0.599], // number[]
|
|
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
|
Float64Array.of(0.4, 0.5, 0.599), // Float64Array
|
|
])("can search using vectorlike datatypes", async (vectorlike) => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [
|
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
|
];
|
|
const table = await db.createTable("test", data);
|
|
|
|
// biome-ignore lint/suspicious/noExplicitAny: test
|
|
const results: any[] = await table.search(vectorlike).toArray();
|
|
|
|
expect(results.length).toBe(2);
|
|
expect(results[0].text).toBe(data[1].text);
|
|
});
|
|
},
|
|
);
|
|
|
|
describe("when calling explainPlan", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let table: Table;
|
|
let queryVec: number[];
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const con = await connect(tmpDir.name);
|
|
table = await con.createTable("vectors", [{ id: 1, vector: [0.1, 0.2] }]);
|
|
});
|
|
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
it("retrieves query plan", async () => {
|
|
queryVec = Array(2)
|
|
.fill(1)
|
|
.map(() => Math.random());
|
|
const plan = await table.query().nearestTo(queryVec).explainPlan(true);
|
|
|
|
expect(plan).toMatch("KNN");
|
|
});
|
|
});
|
|
|
|
describe("when calling analyzePlan", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let table: Table;
|
|
let queryVec: number[];
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const con = await connect(tmpDir.name);
|
|
table = await con.createTable("vectors", [{ id: 1, vector: [1.1, 0.9] }]);
|
|
});
|
|
|
|
afterEach(() => {
|
|
tmpDir.removeCallback();
|
|
});
|
|
|
|
it("retrieves runtime metrics", async () => {
|
|
queryVec = Array(2)
|
|
.fill(1)
|
|
.map(() => Math.random());
|
|
const plan = await table.query().nearestTo(queryVec).analyzePlan();
|
|
console.log("Query Plan:\n", plan); // <--- Print the plan
|
|
expect(plan).toMatch("AnalyzeExec");
|
|
});
|
|
});
|
|
|
|
describe("column name options", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let table: Table;
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
const con = await connect(tmpDir.name);
|
|
table = await con.createTable("vectors", [
|
|
{ camelCase: 1, vector: [0.1, 0.2] },
|
|
]);
|
|
});
|
|
|
|
test("can select columns with different names", async () => {
|
|
const results = await table.query().select(["camelCase"]).toArray();
|
|
expect(results[0].camelCase).toBe(1);
|
|
});
|
|
|
|
test("can filter on columns with different names", async () => {
|
|
const results = await table.query().where("`camelCase` = 1").toArray();
|
|
expect(results[0].camelCase).toBe(1);
|
|
});
|
|
|
|
test("can make multiple vector queries in one go", async () => {
|
|
const results = await table
|
|
.query()
|
|
.nearestTo([0.1, 0.2])
|
|
.addQueryVector([0.1, 0.2])
|
|
.limit(1)
|
|
.toArray();
|
|
console.log(results);
|
|
expect(results.length).toBe(2);
|
|
results.sort((a, b) => a.query_index - b.query_index);
|
|
expect(results[0].query_index).toBe(0);
|
|
expect(results[1].query_index).toBe(1);
|
|
});
|
|
|
|
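
  // Multivector rows hold a list of vectors; search compares the query's
  // vectors against each row's vectors in aggregate (a MaxSim-style
  // comparison), which is why the index below uses cosine distance.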
test("index and search multivectors", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [];
|
|
// generate 512 random multivectors
|
|
for (let i = 0; i < 256; i++) {
|
|
data.push({
|
|
multivector: Array.from({ length: 10 }, () =>
|
|
Array(2).fill(Math.random()),
|
|
),
|
|
});
|
|
}
|
|
const table = await db.createTable("multivectors", data, {
|
|
schema: new Schema([
|
|
new Field(
|
|
"multivector",
|
|
new List(
|
|
new Field(
|
|
"item",
|
|
new FixedSizeList(2, new Field("item", new Float32())),
|
|
),
|
|
),
|
|
),
|
|
]),
|
|
});
|
|
|
|
const results = await table.search(data[0].multivector).limit(10).toArray();
|
|
expect(results.length).toBe(10);
|
|
|
|
await table.createIndex("multivector", {
|
|
config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
|
|
});
|
|
|
|
const results2 = await table
|
|
.search(data[0].multivector)
|
|
.limit(10)
|
|
.toArray();
|
|
expect(results2.length).toBe(10);
|
|
});
|
|
});
|
|
|
|
describe("when creating an empty table", () => {
|
|
let con: Connection;
|
|
beforeEach(async () => {
|
|
const tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
con = await connect(tmpDir.name);
|
|
});
|
|
afterEach(() => {
|
|
con.close();
|
|
});
|
|
|
|
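
  // For float fields, `precision` holds the arrow Precision enum value;
  // 2 corresponds to Precision.DOUBLE (i.e. float64), which is what the
  // assertions below check.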
it("can create an empty table from an arrow Schema", async () => {
|
|
const schema = new Schema([
|
|
new Field("id", new Int64()),
|
|
new Field("vector", new Float64()),
|
|
]);
|
|
const table = await con.createEmptyTable("test", schema);
|
|
const actualSchema = await table.schema();
|
|
expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
|
|
expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
|
|
expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
|
|
expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
|
|
});
|
|
|
|
it("can create an empty table from schema that specifies field types by name", async () => {
|
|
const schemaLike = {
|
|
fields: [
|
|
{
|
|
name: "id",
|
|
type: "int64",
|
|
nullable: true,
|
|
},
|
|
{
|
|
name: "vector",
|
|
type: "float64",
|
|
nullable: true,
|
|
},
|
|
],
|
|
metadata: new Map(),
|
|
names: ["id", "vector"],
|
|
} satisfies SchemaLike;
|
|
const table = await con.createEmptyTable("test", schemaLike);
|
|
const actualSchema = await table.schema();
|
|
expect(actualSchema.fields[0].type.typeId).toBe(Type.Int);
|
|
expect((actualSchema.fields[0].type as Int64).bitWidth).toBe(64);
|
|
expect(actualSchema.fields[1].type.typeId).toBe(Type.Float);
|
|
expect((actualSchema.fields[1].type as Float64).precision).toBe(2);
|
|
});
|
|
});
|