mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-21 22:10:40 +00:00
feat: add flag to enable faster manifest paths (#1612)
The new V2 manifest path scheme makes discovering the latest version of a table constant time on object stores, regardless of the number of versions in the table. See benchmarks in the PR here: https://github.com/lancedb/lance/pull/2798 Closes #1583
This commit is contained in:
@@ -12,9 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import { readdirSync } from "fs";
|
||||
import { Field, Float64, Schema } from "apache-arrow";
|
||||
import * as tmp from "tmp";
|
||||
import { Connection, Table, connect } from "../lancedb";
|
||||
import { LocalTable } from "../lancedb/table";
|
||||
|
||||
describe("when connecting", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
@@ -134,4 +136,57 @@ describe("given a connection", () => {
|
||||
await table.add(data);
|
||||
await expect(isV2(table)).resolves.toBe(true);
|
||||
});
|
||||
|
||||
it("should be able to create tables with V2 manifest paths", async () => {
  const db = await connect(tmpDir.name);

  // Case 1: an empty table created from a schema with the flag enabled.
  let table = (await db.createEmptyTable(
    "test_manifest_paths_v2_empty",
    new Schema([new Field("id", new Float64(), true)]),
    {
      enableV2ManifestPaths: true,
    },
  )) as LocalTable;
  expect(await table.usesV2ManifestPaths()).toBe(true);

  let manifestDir =
    tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions";
  let manifests = readdirSync(manifestDir);
  // Without this check the per-file assertions below would pass vacuously
  // if the directory listing came back empty.
  expect(manifests.length).toBeGreaterThan(0);
  // V2 manifest file names are expected to be exactly 20 digits long.
  manifests.forEach((file) => {
    expect(file).toMatch(/^\d{20}\.manifest$/);
  });

  // Case 2: a table created from initial data with the flag enabled.
  table = (await db.createTable("test_manifest_paths_v2", [{ id: 1 }], {
    enableV2ManifestPaths: true,
  })) as LocalTable;
  expect(await table.usesV2ManifestPaths()).toBe(true);

  manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions";
  manifests = readdirSync(manifestDir);
  expect(manifests.length).toBeGreaterThan(0);
  manifests.forEach((file) => {
    expect(file).toMatch(/^\d{20}\.manifest$/);
  });
});
|
||||
|
||||
it("should be able to migrate tables to the V2 manifest paths", async () => {
  const db = await connect(tmpDir.name);

  // Start from a table that explicitly opts out of the V2 scheme.
  const table = (await db.createEmptyTable(
    "test_manifest_path_migration",
    new Schema([new Field("id", new Float64(), true)]),
    {
      enableV2ManifestPaths: false,
    },
  )) as LocalTable;

  expect(await table.usesV2ManifestPaths()).toBe(false);

  const manifestDir =
    tmpDir.name + "/test_manifest_path_migration.lance/_versions";

  const manifestsBefore = readdirSync(manifestDir);
  // Without this check the per-file assertions below would pass vacuously
  // if the directory listing came back empty.
  expect(manifestsBefore.length).toBeGreaterThan(0);
  // Before migration each manifest is expected to use the short V1 name
  // (a single digit here, since the table has just been created).
  manifestsBefore.forEach((file) => {
    expect(file).toMatch(/^\d\.manifest$/);
  });

  await table.migrateManifestPathsV2();
  expect(await table.usesV2ManifestPaths()).toBe(true);

  // Re-list the directory: after migration every manifest must carry the
  // 20-digit V2 name, and there must still be at least one of them.
  const manifestsAfter = readdirSync(manifestDir);
  expect(manifestsAfter.length).toBeGreaterThan(0);
  manifestsAfter.forEach((file) => {
    expect(file).toMatch(/^\d{20}\.manifest$/);
  });
});
|
||||
});
|
||||
|
||||
@@ -52,6 +52,15 @@ export interface CreateTableOptions {
|
||||
*/
|
||||
dataStorageVersion?: string;
|
||||
|
||||
/**
|
||||
* Use the new V2 manifest paths. These paths provide more efficient
|
||||
* opening of datasets with many versions on object stores. WARNING:
|
||||
* turning this on will make the dataset unreadable for older versions
|
||||
* of LanceDB (prior to 0.10.0). To migrate an existing dataset, instead
|
||||
* use the {@link LocalTable#migrateManifestPathsV2} method.
|
||||
*/
|
||||
enableV2ManifestPaths?: boolean;
|
||||
|
||||
/**
|
||||
* If true then data files will be written with the legacy format
|
||||
*
|
||||
@@ -270,6 +279,7 @@ export class LocalConnection extends Connection {
|
||||
mode,
|
||||
cleanseStorageOptions(options?.storageOptions),
|
||||
dataStorageVersion,
|
||||
options?.enableV2ManifestPaths,
|
||||
);
|
||||
|
||||
return new LocalTable(innerTable);
|
||||
@@ -308,6 +318,7 @@ export class LocalConnection extends Connection {
|
||||
mode,
|
||||
cleanseStorageOptions(options?.storageOptions),
|
||||
dataStorageVersion,
|
||||
options?.enableV2ManifestPaths,
|
||||
);
|
||||
return new LocalTable(innerTable);
|
||||
}
|
||||
|
||||
@@ -697,4 +697,31 @@ export class LocalTable extends Table {
|
||||
on = Array.isArray(on) ? on : [on];
|
||||
return new MergeInsertBuilder(this.inner.mergeInsert(on));
|
||||
}
|
||||
|
||||
/**
 * Report whether this table uses the new (V2) manifest path scheme.
 *
 * @returns A promise resolving to `true` when the table uses the V2
 * manifest path scheme, and `false` otherwise.
 */
async usesV2ManifestPaths(): Promise<boolean> {
  const nativeTable = this.inner;
  const usesV2 = await nativeTable.usesV2ManifestPaths();
  return usesV2;
}
|
||||
|
||||
/**
 * Switch this table over to the new (V2) manifest path scheme.
 *
 * Every V1 manifest is renamed to its V2 manifest path. The V2 paths make
 * opening datasets with many versions more efficient on object stores.
 *
 * The migration is idempotent: running it again does not change the state
 * of the object store.
 *
 * Do not run it while other operations are in progress on the table, and
 * let it run to completion before resuming any other operations.
 *
 * @returns A promise that resolves once the migration has finished.
 */
async migrateManifestPathsV2(): Promise<void> {
  const nativeTable = this.inner;
  await nativeTable.migrateManifestPathsV2();
}
|
||||
}
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.8.0",
|
||||
"version": "0.10.0-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.8.0",
|
||||
"version": "0.10.0-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -124,6 +124,7 @@ impl Connection {
|
||||
mode: String,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
data_storage_options: Option<String>,
|
||||
enable_v2_manifest_paths: Option<bool>,
|
||||
) -> napi::Result<Table> {
|
||||
let batches = ipc_file_to_batches(buf.to_vec())
|
||||
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
|
||||
@@ -140,6 +141,9 @@ impl Connection {
|
||||
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
|
||||
);
|
||||
}
|
||||
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
|
||||
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
|
||||
}
|
||||
let tbl = builder
|
||||
.execute()
|
||||
.await
|
||||
@@ -155,6 +159,7 @@ impl Connection {
|
||||
mode: String,
|
||||
storage_options: Option<HashMap<String, String>>,
|
||||
data_storage_options: Option<String>,
|
||||
enable_v2_manifest_paths: Option<bool>,
|
||||
) -> napi::Result<Table> {
|
||||
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
|
||||
napi::Error::from_reason(format!("Failed to marshal schema from JS to Rust: {}", e))
|
||||
@@ -175,6 +180,9 @@ impl Connection {
|
||||
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
|
||||
);
|
||||
}
|
||||
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
|
||||
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
|
||||
}
|
||||
let tbl = builder
|
||||
.execute()
|
||||
.await
|
||||
|
||||
@@ -347,6 +347,26 @@ impl Table {
|
||||
let on: Vec<_> = on.iter().map(String::as_str).collect();
|
||||
Ok(self.inner_ref()?.merge_insert(on.as_slice()).into())
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn uses_v2_manifest_paths(&self) -> napi::Result<bool> {
|
||||
self.inner_ref()?
|
||||
.as_native()
|
||||
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
|
||||
.uses_v2_manifest_paths()
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn migrate_manifest_paths_v2(&self) -> napi::Result<()> {
|
||||
self.inner_ref()?
|
||||
.as_native()
|
||||
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
|
||||
.migrate_manifest_paths_v2()
|
||||
.await
|
||||
.default_error()
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
|
||||
Reference in New Issue
Block a user