mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 06:39:57 +00:00
Support shallow cloning a dataset at a specific location to create a new dataset, using the shallow_clone feature in Lance. Also introduce remote `clone` API for remote tables for this functionality.
309 lines
10 KiB
TypeScript
309 lines
10 KiB
TypeScript
// SPDX-License-Identifier: Apache-2.0
|
|
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
|
|
import { readdirSync } from "fs";
|
|
import { Field, Float64, Schema } from "apache-arrow";
|
|
import * as tmp from "tmp";
|
|
import { Connection, Table, connect } from "../lancedb";
|
|
import { LocalTable } from "../lancedb/table";
|
|
|
|
describe("when connecting", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
beforeEach(() => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
it("should connect", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
expect(db.display()).toBe(
|
|
`ListingDatabase(uri=${tmpDir.name}, read_consistency_interval=None)`,
|
|
);
|
|
});
|
|
|
|
it("should allow read consistency interval to be specified", async () => {
|
|
const db = await connect(tmpDir.name, { readConsistencyInterval: 5 });
|
|
expect(db.display()).toBe(
|
|
`ListingDatabase(uri=${tmpDir.name}, read_consistency_interval=5s)`,
|
|
);
|
|
});
|
|
});
|
|
|
|
describe("given a connection", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let db: Connection;
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
db = await connect(tmpDir.name);
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
it("should raise an error if opening a non-existent table", async () => {
|
|
await expect(db.openTable("non-existent")).rejects.toThrow("was not found");
|
|
});
|
|
|
|
it("should raise an error if any operation is tried after it is closed", async () => {
|
|
expect(db.isOpen()).toBe(true);
|
|
await db.close();
|
|
expect(db.isOpen()).toBe(false);
|
|
await expect(db.tableNames()).rejects.toThrow("Connection is closed");
|
|
});
|
|
it("should be able to create a table from an object arg `createTable(options)`, or args `createTable(name, data, options)`", async () => {
|
|
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
|
await expect(tbl.countRows()).resolves.toBe(2);
|
|
|
|
tbl = await db.createTable({
|
|
name: "test",
|
|
data: [{ id: 3 }],
|
|
mode: "overwrite",
|
|
});
|
|
|
|
await expect(tbl.countRows()).resolves.toBe(1);
|
|
});
|
|
|
|
it("should be able to drop tables`", async () => {
|
|
await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
|
await db.createTable("test2", [{ id: 1 }, { id: 2 }]);
|
|
await db.createTable("test3", [{ id: 1 }, { id: 2 }]);
|
|
|
|
await expect(db.tableNames()).resolves.toEqual(["test", "test2", "test3"]);
|
|
|
|
await db.dropTable("test2");
|
|
|
|
await expect(db.tableNames()).resolves.toEqual(["test", "test3"]);
|
|
|
|
await db.dropAllTables();
|
|
|
|
await expect(db.tableNames()).resolves.toEqual([]);
|
|
|
|
// Make sure we can still create more tables after dropping all
|
|
|
|
await db.createTable("test4", [{ id: 1 }, { id: 2 }]);
|
|
});
|
|
|
|
it("should fail if creating table twice, unless overwrite is true", async () => {
|
|
let tbl = await db.createTable("test", [{ id: 1 }, { id: 2 }]);
|
|
await expect(tbl.countRows()).resolves.toBe(2);
|
|
await expect(
|
|
db.createTable("test", [{ id: 1 }, { id: 2 }]),
|
|
).rejects.toThrow();
|
|
tbl = await db.createTable("test", [{ id: 3 }], { mode: "overwrite" });
|
|
await expect(tbl.countRows()).resolves.toBe(1);
|
|
});
|
|
|
|
it("should respect limit and page token when listing tables", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
|
|
await db.createTable("b", [{ id: 1 }]);
|
|
await db.createTable("a", [{ id: 1 }]);
|
|
await db.createTable("c", [{ id: 1 }]);
|
|
|
|
let tables = await db.tableNames();
|
|
expect(tables).toEqual(["a", "b", "c"]);
|
|
|
|
tables = await db.tableNames({ limit: 1 });
|
|
expect(tables).toEqual(["a"]);
|
|
|
|
tables = await db.tableNames({ limit: 1, startAfter: "a" });
|
|
expect(tables).toEqual(["b"]);
|
|
|
|
tables = await db.tableNames({ startAfter: "a" });
|
|
expect(tables).toEqual(["b", "c"]);
|
|
});
|
|
|
|
it("should create tables in v2 mode", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const data = [...Array(10000).keys()].map((i) => ({ id: i }));
|
|
|
|
// Create in v1 mode
|
|
let table = await db.createTable("test", data, {
|
|
storageOptions: { newTableDataStorageVersion: "legacy" },
|
|
});
|
|
|
|
const isV2 = async (table: Table) => {
|
|
const data = await table
|
|
.query()
|
|
.limit(10000)
|
|
.toArrow({ maxBatchLength: 100000 });
|
|
return data.batches.length < 5;
|
|
};
|
|
|
|
await expect(isV2(table)).resolves.toBe(false);
|
|
|
|
// Create in v2 mode
|
|
table = await db.createTable("test_v2", data);
|
|
|
|
await expect(isV2(table)).resolves.toBe(true);
|
|
|
|
await table.add(data);
|
|
|
|
await expect(isV2(table)).resolves.toBe(true);
|
|
|
|
// Create empty in v2 mode
|
|
const schema = new Schema([new Field("id", new Float64(), true)]);
|
|
|
|
table = await db.createEmptyTable("test_v2_empty", schema, {
|
|
storageOptions: { newTableDataStorageVersion: "stable" },
|
|
});
|
|
|
|
await table.add(data);
|
|
await expect(isV2(table)).resolves.toBe(true);
|
|
});
|
|
|
|
it("should be able to create tables with V2 manifest paths", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
let table = (await db.createEmptyTable(
|
|
"test_manifest_paths_v2_empty",
|
|
new Schema([new Field("id", new Float64(), true)]),
|
|
{
|
|
enableV2ManifestPaths: true,
|
|
},
|
|
)) as LocalTable;
|
|
expect(await table.usesV2ManifestPaths()).toBe(true);
|
|
|
|
let manifestDir =
|
|
tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions";
|
|
readdirSync(manifestDir).forEach((file) => {
|
|
expect(file).toMatch(/^\d{20}\.manifest$/);
|
|
});
|
|
|
|
table = (await db.createTable("test_manifest_paths_v2", [{ id: 1 }], {
|
|
enableV2ManifestPaths: true,
|
|
})) as LocalTable;
|
|
expect(await table.usesV2ManifestPaths()).toBe(true);
|
|
manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions";
|
|
readdirSync(manifestDir).forEach((file) => {
|
|
expect(file).toMatch(/^\d{20}\.manifest$/);
|
|
});
|
|
});
|
|
|
|
it("should be able to migrate tables to the V2 manifest paths", async () => {
|
|
const db = await connect(tmpDir.name);
|
|
const table = (await db.createEmptyTable(
|
|
"test_manifest_path_migration",
|
|
new Schema([new Field("id", new Float64(), true)]),
|
|
{
|
|
enableV2ManifestPaths: false,
|
|
},
|
|
)) as LocalTable;
|
|
|
|
expect(await table.usesV2ManifestPaths()).toBe(false);
|
|
|
|
const manifestDir =
|
|
tmpDir.name + "/test_manifest_path_migration.lance/_versions";
|
|
readdirSync(manifestDir).forEach((file) => {
|
|
expect(file).toMatch(/^\d\.manifest$/);
|
|
});
|
|
|
|
await table.migrateManifestPathsV2();
|
|
expect(await table.usesV2ManifestPaths()).toBe(true);
|
|
|
|
readdirSync(manifestDir).forEach((file) => {
|
|
expect(file).toMatch(/^\d{20}\.manifest$/);
|
|
});
|
|
});
|
|
});
|
|
|
|
describe("clone table functionality", () => {
|
|
let tmpDir: tmp.DirResult;
|
|
let db: Connection;
|
|
beforeEach(async () => {
|
|
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
|
db = await connect(tmpDir.name);
|
|
});
|
|
afterEach(() => tmpDir.removeCallback());
|
|
|
|
it("should clone a table with latest version (default behavior)", async () => {
|
|
// Create source table with some data
|
|
const data = [
|
|
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
|
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
|
];
|
|
const sourceTable = await db.createTable("source", data);
|
|
|
|
// Add more data to create a new version
|
|
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
|
await sourceTable.add(moreData);
|
|
|
|
// Clone the table (should get latest version with 3 rows)
|
|
const sourceUri = `${tmpDir.name}/source.lance`;
|
|
const clonedTable = await db.cloneTable("cloned", sourceUri);
|
|
|
|
// Verify cloned table has all 3 rows
|
|
expect(await clonedTable.countRows()).toBe(3);
|
|
expect((await db.tableNames()).includes("cloned")).toBe(true);
|
|
});
|
|
|
|
it("should clone a table from a specific version", async () => {
|
|
// Create source table with initial data
|
|
const data = [
|
|
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
|
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
|
];
|
|
const sourceTable = await db.createTable("source", data);
|
|
|
|
// Get the initial version
|
|
const initialVersion = await sourceTable.version();
|
|
|
|
// Add more data to create a new version
|
|
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
|
await sourceTable.add(moreData);
|
|
|
|
// Verify source now has 3 rows
|
|
expect(await sourceTable.countRows()).toBe(3);
|
|
|
|
// Clone from the initial version (should have only 2 rows)
|
|
const sourceUri = `${tmpDir.name}/source.lance`;
|
|
const clonedTable = await db.cloneTable("cloned", sourceUri, {
|
|
sourceVersion: initialVersion,
|
|
});
|
|
|
|
// Verify cloned table has only the initial 2 rows
|
|
expect(await clonedTable.countRows()).toBe(2);
|
|
});
|
|
|
|
it("should clone a table from a tagged version", async () => {
|
|
// Create source table with initial data
|
|
const data = [
|
|
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
|
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
|
];
|
|
const sourceTable = await db.createTable("source", data);
|
|
|
|
// Create a tag for the current version
|
|
const tags = await sourceTable.tags();
|
|
await tags.create("v1.0", await sourceTable.version());
|
|
|
|
// Add more data after the tag
|
|
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
|
await sourceTable.add(moreData);
|
|
|
|
// Verify source now has 3 rows
|
|
expect(await sourceTable.countRows()).toBe(3);
|
|
|
|
// Clone from the tagged version (should have only 2 rows)
|
|
const sourceUri = `${tmpDir.name}/source.lance`;
|
|
const clonedTable = await db.cloneTable("cloned", sourceUri, {
|
|
sourceTag: "v1.0",
|
|
});
|
|
|
|
// Verify cloned table has only the tagged version's 2 rows
|
|
expect(await clonedTable.countRows()).toBe(2);
|
|
});
|
|
|
|
it("should fail when attempting deep clone", async () => {
|
|
// Create source table with some data
|
|
const data = [
|
|
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
|
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
|
];
|
|
await db.createTable("source", data);
|
|
|
|
// Try to create a deep clone (should fail)
|
|
const sourceUri = `${tmpDir.name}/source.lance`;
|
|
await expect(
|
|
db.cloneTable("cloned", sourceUri, { isShallow: false }),
|
|
).rejects.toThrow("Deep clone is not yet implemented");
|
|
});
|
|
});
|