// Copyright 2024 Lance Developers. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. import * as fs from "fs"; import * as path from "path"; import * as tmp from "tmp"; import * as arrow from "apache-arrow"; import * as arrowOld from "apache-arrow-old"; import { Table, connect } from "../lancedb"; import { Table as ArrowTable, Field, FixedSizeList, Float32, Float64, Int32, Int64, Schema, makeArrowTable, } from "../lancedb/arrow"; import { EmbeddingFunction, LanceSchema, register } from "../lancedb/embedding"; import { Index } from "../lancedb/indices"; // biome-ignore lint/suspicious/noExplicitAny: describe.each([arrow, arrowOld])("Given a table", (arrow: any) => { let tmpDir: tmp.DirResult; let table: Table; const schema = new arrow.Schema([ new arrow.Field("id", new arrow.Float64(), true), ]); beforeEach(async () => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); const conn = await connect(tmpDir.name); table = await conn.createEmptyTable("some_table", schema); }); afterEach(() => tmpDir.removeCallback()); it("be displayable", async () => { expect(table.display()).toMatch( /NativeTable\(some_table, uri=.*, read_consistency_interval=None\)/, ); table.close(); expect(table.display()).toBe("ClosedTable(some_table)"); }); it("should let me add data", async () => { await table.add([{ id: 1 }, { id: 2 }]); await table.add([{ id: 1 }]); await expect(table.countRows()).resolves.toBe(3); }); it("should overwrite data if asked", async () => { await table.add([{ id: 1 }, { id: 2 }]); await table.add([{ id: 1 }], { mode: "overwrite" }); await expect(table.countRows()).resolves.toBe(1); }); it("should let me close the table", async () => { expect(table.isOpen()).toBe(true); table.close(); expect(table.isOpen()).toBe(false); expect(table.countRows()).rejects.toThrow("Table some_table is closed"); }); it("should let me update values", async () => { await table.add([{ id: 1 }]); expect(await table.countRows("id == 1")).toBe(1); expect(await table.countRows("id == 7")).toBe(0); await table.update({ id: "7" }); expect(await table.countRows("id == 1")).toBe(0); expect(await table.countRows("id == 7")).toBe(1); await table.add([{ id: 2 }]); // Test Map as input await table.update(new Map(Object.entries({ id: "10" })), { where: "id % 2 == 0", }); expect(await table.countRows("id == 2")).toBe(0); expect(await table.countRows("id == 7")).toBe(1); expect(await table.countRows("id == 10")).toBe(1); }); // https://github.com/lancedb/lancedb/issues/1293 test.each([new arrow.Float16(), new arrow.Float32(), new arrow.Float64()])( "can create empty table with non default float type: %s", async (floatType) => { const db = await connect(tmpDir.name); const data = [ { text: "hello", vector: Array(512).fill(1.0) }, { text: "hello world", vector: Array(512).fill(1.0) }, ]; const f64Schema = new arrow.Schema([ new arrow.Field("text", new arrow.Utf8(), true), new arrow.Field( "vector", new arrow.FixedSizeList(512, new arrow.Field("item", floatType)), true, ), ]); const f64Table = await db.createEmptyTable("f64", f64Schema, { mode: "overwrite", }); try { await f64Table.add(data); const res = await f64Table.query().toArray(); expect(res.length).toBe(2); } catch (e) { expect(e).toBeUndefined(); } }, ); it("should return the table as an instance of an arrow table", async () => { const arrowTbl = await table.toArrow(); expect(arrowTbl).toBeInstanceOf(ArrowTable); }); }); describe("merge insert", () => { let tmpDir: tmp.DirResult; let table: Table; beforeEach(async () => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); const conn = await connect(tmpDir.name); table = await conn.createTable("some_table", [ { a: 1, b: "a" }, { a: 2, b: "b" }, { a: 3, b: "c" }, ]); }); afterEach(() => tmpDir.removeCallback()); test("upsert", async () => { const newData = [ { a: 2, b: "x" }, { a: 3, b: "y" }, { a: 4, b: "z" }, ]; await table .mergeInsert("a") .whenMatchedUpdateAll() .whenNotMatchedInsertAll() .execute(newData); const expected = [ { a: 1, b: "a" }, { a: 2, b: "x" }, { a: 3, b: "y" }, { a: 4, b: "z" }, ]; expect( JSON.parse(JSON.stringify((await table.toArrow()).toArray())), ).toEqual(expected); }); test("conditional update", async () => { const newData = [ { a: 2, b: "x" }, { a: 3, b: "y" }, { a: 4, b: "z" }, ]; await table .mergeInsert("a") .whenMatchedUpdateAll({ where: "target.b = 'b'" }) .execute(newData); const expected = [ { a: 1, b: "a" }, { a: 2, b: "x" }, { a: 3, b: "c" }, ]; // round trip to arrow and back to json to avoid comparing arrow objects to js object // biome-ignore lint/suspicious/noExplicitAny: test let res: any[] = JSON.parse( JSON.stringify((await table.toArrow()).toArray()), ); res = res.sort((a, b) => a.a - b.a); expect(res).toEqual(expected); }); test("insert if not exists", async () => { const newData = [ { a: 2, b: "x" }, { a: 3, b: "y" }, { a: 4, b: "z" }, ]; await table.mergeInsert("a").whenNotMatchedInsertAll().execute(newData); const expected = [ { a: 1, b: "a" }, { a: 2, b: "b" }, { a: 3, b: "c" }, { a: 4, b: "z" }, ]; // biome-ignore lint/suspicious/noExplicitAny: let res: any[] = JSON.parse( JSON.stringify((await table.toArrow()).toArray()), ); res = res.sort((a, b) => a.a - b.a); expect(res).toEqual(expected); }); test("replace range", async () => { const newData = [ { a: 2, b: "x" }, { a: 4, b: "z" }, ]; await table .mergeInsert("a") .whenMatchedUpdateAll() .whenNotMatchedInsertAll() .whenNotMatchedBySourceDelete({ where: "a > 2" }) .execute(newData); const expected = [ { a: 1, b: "a" }, { a: 2, b: "x" }, { a: 4, b: "z" }, ]; // biome-ignore lint/suspicious/noExplicitAny: let res: any[] = JSON.parse( JSON.stringify((await table.toArrow()).toArray()), ); res = res.sort((a, b) => a.a - b.a); expect(res).toEqual(expected); }); test("replace range no condition", async () => { const newData = [ { a: 2, b: "x" }, { a: 4, b: "z" }, ]; await table .mergeInsert("a") .whenMatchedUpdateAll() .whenNotMatchedInsertAll() .whenNotMatchedBySourceDelete() .execute(newData); const expected = [ { a: 2, b: "x" }, { a: 4, b: "z" }, ]; // biome-ignore lint/suspicious/noExplicitAny: test let res: any[] = JSON.parse( JSON.stringify((await table.toArrow()).toArray()), ); res = res.sort((a, b) => a.a - b.a); expect(res).toEqual(expected); }); }); describe("When creating an index", () => { let tmpDir: tmp.DirResult; const schema = new Schema([ new Field("id", new Int32(), true), new Field("vec", new FixedSizeList(32, new Field("item", new Float32()))), ]); let tbl: Table; let queryVec: number[]; beforeEach(async () => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); const db = await connect(tmpDir.name); const data = makeArrowTable( Array(300) .fill(1) .map((_, i) => ({ id: i, vec: Array(32) .fill(1) .map(() => Math.random()), })), { schema, }, ); queryVec = data.toArray()[5].vec.toJSON(); tbl = await db.createTable("test", data); }); afterEach(() => tmpDir.removeCallback()); it("should create a vector index on vector columns", async () => { await tbl.createIndex("vec"); // check index directory const indexDir = path.join(tmpDir.name, "test.lance", "_indices"); expect(fs.readdirSync(indexDir)).toHaveLength(1); const indices = await tbl.listIndices(); expect(indices.length).toBe(1); expect(indices[0]).toEqual({ name: "vec_idx", indexType: "IvfPq", columns: ["vec"], }); // Search without specifying the column let rst = await tbl .query() .limit(2) .nearestTo(queryVec) .distanceType("DoT") .toArrow(); expect(rst.numRows).toBe(2); // Search using `vectorSearch` rst = await tbl.vectorSearch(queryVec).limit(2).toArrow(); expect(rst.numRows).toBe(2); // Search with specifying the column const rst2 = await tbl .query() .limit(2) .nearestTo(queryVec) .column("vec") .toArrow(); expect(rst2.numRows).toBe(2); expect(rst.toString()).toEqual(rst2.toString()); }); it("should allow parameters to be specified", async () => { await tbl.createIndex("vec", { config: Index.ivfPq({ numPartitions: 10, }), }); // TODO: Verify parameters when we can load index config as part of list indices }); it("should allow me to replace (or not) an existing index", async () => { await tbl.createIndex("id"); // Default is replace=true await tbl.createIndex("id"); await expect(tbl.createIndex("id", { replace: false })).rejects.toThrow( "already exists", ); await tbl.createIndex("id", { replace: true }); }); test("should create a scalar index on scalar columns", async () => { await tbl.createIndex("id"); const indexDir = path.join(tmpDir.name, "test.lance", "_indices"); expect(fs.readdirSync(indexDir)).toHaveLength(1); for await (const r of tbl.query().where("id > 1").select(["id"])) { expect(r.numRows).toBe(298); } // should also work with 'filter' alias for await (const r of tbl.query().filter("id > 1").select(["id"])) { expect(r.numRows).toBe(298); } }); test("should be able to get index stats", async () => { await tbl.createIndex("id"); const stats = await tbl.indexStats("id_idx"); expect(stats).toBeDefined(); expect(stats?.numIndexedRows).toEqual(300); expect(stats?.numUnindexedRows).toEqual(0); }); test("when getting stats on non-existent index", async () => { const stats = await tbl.indexStats("some non-existent index"); expect(stats).toBeUndefined(); }); // TODO: Move this test to the query API test (making sure we can reject queries // when the dimension is incorrect) test("two columns with different dimensions", async () => { const db = await connect(tmpDir.name); const schema = new Schema([ new Field("id", new Int32(), true), new Field("vec", new FixedSizeList(32, new Field("item", new Float32()))), new Field( "vec2", new FixedSizeList(64, new Field("item", new Float32())), ), ]); const tbl = await db.createTable( "two_vectors", makeArrowTable( Array(300) .fill(1) .map((_, i) => ({ id: i, vec: Array(32) .fill(1) .map(() => Math.random()), vec2: Array(64) // different dimension .fill(1) .map(() => Math.random()), })), { schema }, ), ); // Only build index over v1 await tbl.createIndex("vec", { config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }), }); const rst = await tbl .query() .limit(2) .nearestTo( Array(32) .fill(1) .map(() => Math.random()), ) .toArrow(); expect(rst.numRows).toBe(2); // Search with specifying the column await expect( tbl .query() .limit(2) .nearestTo( Array(64) .fill(1) .map(() => Math.random()), ) .column("vec") .toArrow(), ).rejects.toThrow(/.* query dim=64, expected vector dim=32.*/); const query64 = Array(64) .fill(1) .map(() => Math.random()); const rst64Query = await tbl.query().limit(2).nearestTo(query64).toArrow(); const rst64Search = await tbl .query() .limit(2) .nearestTo(query64) .column("vec2") .toArrow(); expect(rst64Query.toString()).toEqual(rst64Search.toString()); expect(rst64Query.numRows).toBe(2); }); }); describe("Read consistency interval", () => { let tmpDir: tmp.DirResult; beforeEach(() => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); }); afterEach(() => tmpDir.removeCallback()); // const intervals = [undefined, 0, 0.1]; const intervals = [0]; test.each(intervals)("read consistency interval %p", async (interval) => { const db = await connect(tmpDir.name); const table = await db.createTable("my_table", [{ id: 1 }]); const db2 = await connect(tmpDir.name, { readConsistencyInterval: interval, }); const table2 = await db2.openTable("my_table"); expect(await table2.countRows()).toEqual(await table.countRows()); await table.add([{ id: 2 }]); if (interval === undefined) { expect(await table2.countRows()).toEqual(1); // TODO: once we implement time travel we can uncomment this part of the test. // await table2.checkout_latest(); // expect(await table2.countRows()).toEqual(2); } else if (interval === 0) { expect(await table2.countRows()).toEqual(2); } else { // interval == 0.1 expect(await table2.countRows()).toEqual(1); await new Promise((r) => setTimeout(r, 100)); expect(await table2.countRows()).toEqual(2); } }); }); describe("schema evolution", function () { let tmpDir: tmp.DirResult; beforeEach(() => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); }); afterEach(() => { tmpDir.removeCallback(); }); // Create a new sample table it("can add a new column to the schema", async function () { const con = await connect(tmpDir.name); const table = await con.createTable("vectors", [ { id: 1n, vector: [0.1, 0.2] }, ]); await table.addColumns([ { name: "price", valueSql: "cast(10.0 as float)" }, ]); const expectedSchema = new Schema([ new Field("id", new Int64(), true), new Field( "vector", new FixedSizeList(2, new Field("item", new Float32(), true)), true, ), new Field("price", new Float32(), false), ]); expect(await table.schema()).toEqual(expectedSchema); }); it("can alter the columns in the schema", async function () { const con = await connect(tmpDir.name); const schema = new Schema([ new Field("id", new Int64(), true), new Field( "vector", new FixedSizeList(2, new Field("item", new Float32(), true)), true, ), new Field("price", new Float64(), false), ]); const table = await con.createTable("vectors", [ { id: 1n, vector: [0.1, 0.2] }, ]); // Can create a non-nullable column only through addColumns at the moment. await table.addColumns([ { name: "price", valueSql: "cast(10.0 as double)" }, ]); expect(await table.schema()).toEqual(schema); await table.alterColumns([ { path: "id", rename: "new_id" }, { path: "price", nullable: true }, ]); const expectedSchema = new Schema([ new Field("new_id", new Int64(), true), new Field( "vector", new FixedSizeList(2, new Field("item", new Float32(), true)), true, ), new Field("price", new Float64(), true), ]); expect(await table.schema()).toEqual(expectedSchema); }); it("can drop a column from the schema", async function () { const con = await connect(tmpDir.name); const table = await con.createTable("vectors", [ { id: 1n, vector: [0.1, 0.2] }, ]); await table.dropColumns(["vector"]); const expectedSchema = new Schema([new Field("id", new Int64(), true)]); expect(await table.schema()).toEqual(expectedSchema); }); }); describe("when dealing with versioning", () => { let tmpDir: tmp.DirResult; beforeEach(() => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); }); afterEach(() => { tmpDir.removeCallback(); }); it("can travel in time", async () => { // Setup const con = await connect(tmpDir.name); const table = await con.createTable("vectors", [ { id: 1n, vector: [0.1, 0.2] }, ]); const version = await table.version(); await table.add([{ id: 2n, vector: [0.1, 0.2] }]); expect(await table.countRows()).toBe(2); // Make sure we can rewind await table.checkout(version); expect(await table.countRows()).toBe(1); // Can't add data in time travel mode await expect(table.add([{ id: 3n, vector: [0.1, 0.2] }])).rejects.toThrow( "table cannot be modified when a specific version is checked out", ); // Can go back to normal mode await table.checkoutLatest(); expect(await table.countRows()).toBe(2); // Should be able to add data again await table.add([{ id: 2n, vector: [0.1, 0.2] }]); expect(await table.countRows()).toBe(3); // Now checkout and restore await table.checkout(version); await table.restore(); expect(await table.countRows()).toBe(1); // Should be able to add data await table.add([{ id: 2n, vector: [0.1, 0.2] }]); expect(await table.countRows()).toBe(2); // Can't use restore if not checked out await expect(table.restore()).rejects.toThrow( "checkout before running restore", ); }); }); describe("when optimizing a dataset", () => { let tmpDir: tmp.DirResult; let table: Table; beforeEach(async () => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); const con = await connect(tmpDir.name); table = await con.createTable("vectors", [{ id: 1 }]); await table.add([{ id: 2 }]); }); afterEach(() => { tmpDir.removeCallback(); }); it("compacts files", async () => { const stats = await table.optimize(); expect(stats.compaction.filesAdded).toBe(1); expect(stats.compaction.filesRemoved).toBe(2); expect(stats.compaction.fragmentsAdded).toBe(1); expect(stats.compaction.fragmentsRemoved).toBe(2); }); it("cleanups old versions", async () => { const stats = await table.optimize({ cleanupOlderThan: new Date() }); expect(stats.prune.bytesRemoved).toBeGreaterThan(0); expect(stats.prune.oldVersionsRemoved).toBe(3); }); }); describe("table.search", () => { let tmpDir: tmp.DirResult; beforeEach(() => { tmpDir = tmp.dirSync({ unsafeCleanup: true }); }); afterEach(() => tmpDir.removeCallback()); test("can search using a string", async () => { @register() class MockEmbeddingFunction extends EmbeddingFunction { toJSON(): object { return {}; } ndims() { return 1; } embeddingDataType(): arrow.Float { return new Float32(); } // Hardcoded embeddings for the sake of testing async computeQueryEmbeddings(_data: string) { switch (_data) { case "greetings": return [0.1]; case "farewell": return [0.2]; default: return null as never; } } // Hardcoded embeddings for the sake of testing async computeSourceEmbeddings(data: string[]) { return data.map((s) => { switch (s) { case "hello world": return [0.1]; case "goodbye world": return [0.2]; default: return null as never; } }); } } const func = new MockEmbeddingFunction(); const schema = LanceSchema({ text: func.sourceField(new arrow.Utf8()), vector: func.vectorField(), }); const db = await connect(tmpDir.name); const data = [{ text: "hello world" }, { text: "goodbye world" }]; const table = await db.createTable("test", data, { schema }); const results = await table.search("greetings").then((r) => r.toArray()); expect(results[0].text).toBe(data[0].text); const results2 = await table.search("farewell").then((r) => r.toArray()); expect(results2[0].text).toBe(data[1].text); }); test("rejects if no embedding function provided", async () => { const db = await connect(tmpDir.name); const data = [ { text: "hello world", vector: [0.1, 0.2, 0.3] }, { text: "goodbye world", vector: [0.4, 0.5, 0.6] }, ]; const table = await db.createTable("test", data); expect(table.search("hello")).rejects.toThrow( "No embedding functions are defined in the table", ); }); test.each([ [0.4, 0.5, 0.599], // number[] Float32Array.of(0.4, 0.5, 0.599), // Float32Array Float64Array.of(0.4, 0.5, 0.599), // Float64Array ])("can search using vectorlike datatypes", async (vectorlike) => { const db = await connect(tmpDir.name); const data = [ { text: "hello world", vector: [0.1, 0.2, 0.3] }, { text: "goodbye world", vector: [0.4, 0.5, 0.6] }, ]; const table = await db.createTable("test", data); // biome-ignore lint/suspicious/noExplicitAny: test const results: any[] = await table.search(vectorlike).toArray(); expect(results.length).toBe(2); expect(results[0].text).toBe(data[1].text); }); });