feat: add flag to enable faster manifest paths (#1612)

The new V2 manifest path scheme makes discovering the latest version of
a table constant time on object stores, regardless of the number of
versions in the table. See benchmarks in the PR here:
https://github.com/lancedb/lance/pull/2798

Closes #1583
This commit is contained in:
Will Jones
2024-09-09 11:34:36 -07:00
committed by GitHub
parent 029b01bbbf
commit 2a6586d6fb
16 changed files with 292 additions and 2 deletions

View File

@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
import { readdirSync } from "fs";
import { Field, Float64, Schema } from "apache-arrow";
import * as tmp from "tmp";
import { Connection, Table, connect } from "../lancedb";
import { LocalTable } from "../lancedb/table";
describe("when connecting", () => {
let tmpDir: tmp.DirResult;
@@ -134,4 +136,57 @@ describe("given a connection", () => {
await table.add(data);
await expect(isV2(table)).resolves.toBe(true);
});
it("should be able to create tables with V2 manifest paths", async () => {
const db = await connect(tmpDir.name);
let table = (await db.createEmptyTable(
"test_manifest_paths_v2_empty",
new Schema([new Field("id", new Float64(), true)]),
{
enableV2ManifestPaths: true,
},
)) as LocalTable;
expect(await table.usesV2ManifestPaths()).toBe(true);
let manifestDir =
tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
table = (await db.createTable("test_manifest_paths_v2", [{ id: 1 }], {
enableV2ManifestPaths: true,
})) as LocalTable;
expect(await table.usesV2ManifestPaths()).toBe(true);
manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});
it("should be able to migrate tables to the V2 manifest paths", async () => {
const db = await connect(tmpDir.name);
const table = (await db.createEmptyTable(
"test_manifest_path_migration",
new Schema([new Field("id", new Float64(), true)]),
{
enableV2ManifestPaths: false,
},
)) as LocalTable;
expect(await table.usesV2ManifestPaths()).toBe(false);
const manifestDir =
tmpDir.name + "/test_manifest_path_migration.lance/_versions";
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d\.manifest$/);
});
await table.migrateManifestPathsV2();
expect(await table.usesV2ManifestPaths()).toBe(true);
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});
});

View File

@@ -52,6 +52,15 @@ export interface CreateTableOptions {
*/
dataStorageVersion?: string;
/**
* Use the new V2 manifest paths. These paths provide more efficient
* opening of datasets with many versions on object stores. WARNING:
* turning this on will make the dataset unreadable for older versions
* of LanceDB (prior to 0.10.0). To migrate an existing dataset, instead
* use the {@link LocalTable#migrateManifestPathsV2} method.
*/
enableV2ManifestPaths?: boolean;
/**
* If true then data files will be written with the legacy format
*
@@ -270,6 +279,7 @@ export class LocalConnection extends Connection {
mode,
cleanseStorageOptions(options?.storageOptions),
dataStorageVersion,
options?.enableV2ManifestPaths,
);
return new LocalTable(innerTable);
@@ -308,6 +318,7 @@ export class LocalConnection extends Connection {
mode,
cleanseStorageOptions(options?.storageOptions),
dataStorageVersion,
options?.enableV2ManifestPaths,
);
return new LocalTable(innerTable);
}

View File

@@ -697,4 +697,31 @@ export class LocalTable extends Table {
on = Array.isArray(on) ? on : [on];
return new MergeInsertBuilder(this.inner.mergeInsert(on));
}
/**
* Check if the table uses the new manifest path scheme.
*
* This function will return true if the table uses the V2 manifest
* path scheme.
*/
async usesV2ManifestPaths(): Promise<boolean> {
return await this.inner.usesV2ManifestPaths();
}
/**
* Migrate the table to use the new manifest path scheme.
*
* This function will rename all V1 manifests to V2 manifest paths.
* These paths provide more efficient opening of datasets with many versions
* on object stores.
*
* This function is idempotent, and can be run multiple times without
* changing the state of the object store.
*
* However, it should not be run while other concurrent operations are happening.
* And it should also run until completion before resuming other operations.
*/
async migrateManifestPathsV2(): Promise<void> {
await this.inner.migrateManifestPathsV2();
}
}

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.8.0",
"version": "0.10.0-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.8.0",
"version": "0.10.0-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -124,6 +124,7 @@ impl Connection {
mode: String,
storage_options: Option<HashMap<String, String>>,
data_storage_options: Option<String>,
enable_v2_manifest_paths: Option<bool>,
) -> napi::Result<Table> {
let batches = ipc_file_to_batches(buf.to_vec())
.map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?;
@@ -140,6 +141,9 @@ impl Connection {
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
);
}
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
}
let tbl = builder
.execute()
.await
@@ -155,6 +159,7 @@ impl Connection {
mode: String,
storage_options: Option<HashMap<String, String>>,
data_storage_options: Option<String>,
enable_v2_manifest_paths: Option<bool>,
) -> napi::Result<Table> {
let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| {
napi::Error::from_reason(format!("Failed to marshal schema from JS to Rust: {}", e))
@@ -175,6 +180,9 @@ impl Connection {
.map_err(|e| napi::Error::from_reason(format!("{}", e)))?,
);
}
if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths {
builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths);
}
let tbl = builder
.execute()
.await

View File

@@ -347,6 +347,26 @@ impl Table {
let on: Vec<_> = on.iter().map(String::as_str).collect();
Ok(self.inner_ref()?.merge_insert(on.as_slice()).into())
}
#[napi(catch_unwind)]
pub async fn uses_v2_manifest_paths(&self) -> napi::Result<bool> {
self.inner_ref()?
.as_native()
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
.uses_v2_manifest_paths()
.await
.default_error()
}
#[napi(catch_unwind)]
pub async fn migrate_manifest_paths_v2(&self) -> napi::Result<()> {
self.inner_ref()?
.as_native()
.ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))?
.migrate_manifest_paths_v2()
.await
.default_error()
}
}
#[napi(object)]