diff --git a/Cargo.toml b/Cargo.toml index e16528c3..d9ec9a3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ categories = ["database-implementations"] lance = { "version" = "=0.17.0", "features" = ["dynamodb"] } lance-index = { "version" = "=0.17.0" } lance-linalg = { "version" = "=0.17.0" } +lance-table = { "version" = "=0.17.0" } lance-testing = { "version" = "=0.17.0" } lance-datafusion = { "version" = "=0.17.0" } lance-encoding = { "version" = "=0.17.0" } diff --git a/nodejs/__test__/connection.test.ts b/nodejs/__test__/connection.test.ts index 616c48b3..3541e94f 100644 --- a/nodejs/__test__/connection.test.ts +++ b/nodejs/__test__/connection.test.ts @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +import { readdirSync } from "fs"; import { Field, Float64, Schema } from "apache-arrow"; import * as tmp from "tmp"; import { Connection, Table, connect } from "../lancedb"; +import { LocalTable } from "../lancedb/table"; describe("when connecting", () => { let tmpDir: tmp.DirResult; @@ -134,4 +136,57 @@ describe("given a connection", () => { await table.add(data); await expect(isV2(table)).resolves.toBe(true); }); + + it("should be able to create tables with V2 manifest paths", async () => { + const db = await connect(tmpDir.name); + let table = (await db.createEmptyTable( + "test_manifest_paths_v2_empty", + new Schema([new Field("id", new Float64(), true)]), + { + enableV2ManifestPaths: true, + }, + )) as LocalTable; + expect(await table.usesV2ManifestPaths()).toBe(true); + + let manifestDir = + tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions"; + readdirSync(manifestDir).forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); + + table = (await db.createTable("test_manifest_paths_v2", [{ id: 1 }], { + enableV2ManifestPaths: true, + })) as LocalTable; + expect(await table.usesV2ManifestPaths()).toBe(true); + manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions"; + readdirSync(manifestDir).forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); + }); + + it("should be able to migrate tables to the V2 manifest paths", async () => { + const db = await connect(tmpDir.name); + const table = (await db.createEmptyTable( + "test_manifest_path_migration", + new Schema([new Field("id", new Float64(), true)]), + { + enableV2ManifestPaths: false, + }, + )) as LocalTable; + + expect(await table.usesV2ManifestPaths()).toBe(false); + + const manifestDir = + tmpDir.name + "/test_manifest_path_migration.lance/_versions"; + readdirSync(manifestDir).forEach((file) => { + expect(file).toMatch(/^\d\.manifest$/); + }); + + await table.migrateManifestPathsV2(); + expect(await table.usesV2ManifestPaths()).toBe(true); + + readdirSync(manifestDir).forEach((file) => { + expect(file).toMatch(/^\d{20}\.manifest$/); + }); + }); }); diff --git a/nodejs/lancedb/connection.ts b/nodejs/lancedb/connection.ts index c0ef9540..af177348 100644 --- a/nodejs/lancedb/connection.ts +++ b/nodejs/lancedb/connection.ts @@ -52,6 +52,15 @@ export interface CreateTableOptions { */ dataStorageVersion?: string; + /** + * Use the new V2 manifest paths. These paths provide more efficient + * opening of datasets with many versions on object stores. WARNING: + * turning this on will make the dataset unreadable for older versions + * of LanceDB (prior to 0.10.0). To migrate an existing dataset, instead + * use the {@link LocalTable#migrateManifestPathsV2} method. + */ + enableV2ManifestPaths?: boolean; + /** * If true then data files will be written with the legacy format * @@ -270,6 +279,7 @@ export class LocalConnection extends Connection { mode, cleanseStorageOptions(options?.storageOptions), dataStorageVersion, + options?.enableV2ManifestPaths, ); return new LocalTable(innerTable); @@ -308,6 +318,7 @@ export class LocalConnection extends Connection { mode, cleanseStorageOptions(options?.storageOptions), dataStorageVersion, + options?.enableV2ManifestPaths, ); return new LocalTable(innerTable); } diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts index 11a76e9c..4228fa0d 100644 --- a/nodejs/lancedb/table.ts +++ b/nodejs/lancedb/table.ts @@ -697,4 +697,31 @@ export class LocalTable extends Table { on = Array.isArray(on) ? on : [on]; return new MergeInsertBuilder(this.inner.mergeInsert(on)); } + + /** + * Check if the table uses the new manifest path scheme. + * + * This function will return true if the table uses the V2 manifest + * path scheme. + */ + async usesV2ManifestPaths(): Promise { + return await this.inner.usesV2ManifestPaths(); + } + + /** + * Migrate the table to use the new manifest path scheme. + * + * This function will rename all V1 manifests to V2 manifest paths. + * These paths provide more efficient opening of datasets with many versions + * on object stores. + * + * This function is idempotent, and can be run multiple times without + * changing the state of the object store. + * + * However, it should not be run while other concurrent operations are happening. + * And it should also run until completion before resuming other operations. + */ + async migrateManifestPathsV2(): Promise { + await this.inner.migrateManifestPathsV2(); + } } diff --git a/nodejs/package-lock.json b/nodejs/package-lock.json index bbe9ab2f..e9157ff2 100644 --- a/nodejs/package-lock.json +++ b/nodejs/package-lock.json @@ -1,12 +1,12 @@ { "name": "@lancedb/lancedb", - "version": "0.8.0", + "version": "0.10.0-beta.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@lancedb/lancedb", - "version": "0.8.0", + "version": "0.10.0-beta.1", "cpu": [ "x64", "arm64" diff --git a/nodejs/src/connection.rs b/nodejs/src/connection.rs index e0510ab3..6508e939 100644 --- a/nodejs/src/connection.rs +++ b/nodejs/src/connection.rs @@ -124,6 +124,7 @@ impl Connection { mode: String, storage_options: Option>, data_storage_options: Option, + enable_v2_manifest_paths: Option, ) -> napi::Result { let batches = ipc_file_to_batches(buf.to_vec()) .map_err(|e| napi::Error::from_reason(format!("Failed to read IPC file: {}", e)))?; @@ -140,6 +141,9 @@ impl Connection { .map_err(|e| napi::Error::from_reason(format!("{}", e)))?, ); } + if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths { + builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths); + } let tbl = builder .execute() .await @@ -155,6 +159,7 @@ impl Connection { mode: String, storage_options: Option>, data_storage_options: Option, + enable_v2_manifest_paths: Option, ) -> napi::Result
{ let schema = ipc_file_to_schema(schema_buf.to_vec()).map_err(|e| { napi::Error::from_reason(format!("Failed to marshal schema from JS to Rust: {}", e)) @@ -175,6 +180,9 @@ impl Connection { .map_err(|e| napi::Error::from_reason(format!("{}", e)))?, ); } + if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths { + builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths); + } let tbl = builder .execute() .await diff --git a/nodejs/src/table.rs b/nodejs/src/table.rs index e0a17565..338677e4 100644 --- a/nodejs/src/table.rs +++ b/nodejs/src/table.rs @@ -347,6 +347,26 @@ impl Table { let on: Vec<_> = on.iter().map(String::as_str).collect(); Ok(self.inner_ref()?.merge_insert(on.as_slice()).into()) } + + #[napi(catch_unwind)] + pub async fn uses_v2_manifest_paths(&self) -> napi::Result { + self.inner_ref()? + .as_native() + .ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))? + .uses_v2_manifest_paths() + .await + .default_error() + } + + #[napi(catch_unwind)] + pub async fn migrate_manifest_paths_v2(&self) -> napi::Result<()> { + self.inner_ref()? + .as_native() + .ok_or_else(|| napi::Error::from_reason("This cannot be run on a remote table"))? + .migrate_manifest_paths_v2() + .await + .default_error() + } } #[napi(object)] diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 55c3db99..33b1f07c 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -25,6 +25,7 @@ class Connection(object): data: pa.RecordBatchReader, storage_options: Optional[Dict[str, str]] = None, data_storage_version: Optional[str] = None, + enable_v2_manifest_paths: Optional[bool] = None, ) -> Table: ... async def create_empty_table( self, @@ -33,6 +34,7 @@ class Connection(object): schema: pa.Schema, storage_options: Optional[Dict[str, str]] = None, data_storage_version: Optional[str] = None, + enable_v2_manifest_paths: Optional[bool] = None, ) -> Table: ... class Table: diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py index d2345e4a..8195d09f 100644 --- a/python/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -567,6 +567,7 @@ class AsyncConnection(object): *, data_storage_version: Optional[str] = None, use_legacy_format: Optional[bool] = None, + enable_v2_manifest_paths: Optional[bool] = None, ) -> AsyncTable: """Create an [AsyncTable][lancedb.table.AsyncTable] in the database. @@ -618,6 +619,14 @@ class AsyncConnection(object): If True, use the legacy format for the table. If False, use the new format. The default is True while the new format is in beta. This method is deprecated, use `data_storage_version` instead. + enable_v2_manifest_paths: bool, optional, default False + Use the new V2 manifest paths. These paths provide more efficient + opening of datasets with many versions on object stores. WARNING: + turning this on will make the dataset unreadable for older versions + of LanceDB (prior to 0.13.0). To migrate an existing dataset, instead + use the + [AsyncTable.migrate_manifest_paths_v2][lancedb.table.AsyncTable.migrate_manifest_paths_v2] + method. Returns @@ -761,6 +770,7 @@ class AsyncConnection(object): schema, storage_options=storage_options, data_storage_version=data_storage_version, + enable_v2_manifest_paths=enable_v2_manifest_paths, ) else: data = data_to_reader(data, schema) @@ -770,6 +780,7 @@ class AsyncConnection(object): data, storage_options=storage_options, data_storage_version=data_storage_version, + enable_v2_manifest_paths=enable_v2_manifest_paths, ) return AsyncTable(new_table) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 53e624a0..89cf1247 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2539,3 +2539,34 @@ class AsyncTable: List all indices that have been created with Self::create_index """ return await self._inner.list_indices() + + async def uses_v2_manifest_paths(self) -> bool: + """ + Check if the table is using the new v2 manifest paths. + + Returns + ------- + bool + True if the table is using the new v2 manifest paths, False otherwise. + """ + return await self._inner.uses_v2_manifest_paths() + + async def migrate_manifest_paths_v2(self): + """ + Migrate the manifest paths to the new format. + + This will update the manifest to use the new v2 format for paths. + + This function is idempotent, and can be run multiple times without + changing the state of the object store. + + !!! danger + + This should not be run while other concurrent operations are happening. + And it should also run until completion before resuming other operations. + + You can use + [AsyncTable.uses_v2_manifest_paths][lancedb.table.AsyncTable.uses_v2_manifest_paths] + to check if the table is already using the new path style. + """ + await self._inner.migrate_manifest_paths_v2() diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py index 5b7f3c42..a7f3d33b 100644 --- a/python/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -13,6 +13,7 @@ import re from datetime import timedelta +import os import lancedb import numpy as np @@ -413,6 +414,40 @@ async def test_create_exist_ok_async(tmp_path): # await db.create_table("test", schema=bad_schema, exist_ok=True) +@pytest.mark.asyncio +async def test_create_table_v2_manifest_paths_async(tmp_path): + db = await lancedb.connect_async(tmp_path) + # Create table in v2 mode with v2 manifest paths enabled + tbl = await db.create_table( + "test_v2_manifest_paths", + data=[{"id": 0}], + use_legacy_format=False, + enable_v2_manifest_paths=True, + ) + assert await tbl.uses_v2_manifest_paths() + manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions" + for manifest in os.listdir(manifests_dir): + assert re.match(r"\d{20}\.manifest", manifest) + + # Start a table in V1 mode then migrate + tbl = await db.create_table( + "test_v2_migration", + data=[{"id": 0}], + use_legacy_format=False, + enable_v2_manifest_paths=False, + ) + assert not await tbl.uses_v2_manifest_paths() + manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions" + for manifest in os.listdir(manifests_dir): + assert re.match(r"\d\.manifest", manifest) + + await tbl.migrate_manifest_paths_v2() + assert await tbl.uses_v2_manifest_paths() + + for manifest in os.listdir(manifests_dir): + assert re.match(r"\d{20}\.manifest", manifest) + + def test_open_table_sync(tmp_path): db = lancedb.connect(tmp_path) db.create_table("test", data=[{"id": 0}]) diff --git a/python/src/connection.rs b/python/src/connection.rs index 43c8c6ca..4f7e20a9 100644 --- a/python/src/connection.rs +++ b/python/src/connection.rs @@ -81,6 +81,7 @@ impl Connection { data: Bound<'_, PyAny>, storage_options: Option>, data_storage_version: Option, + enable_v2_manifest_paths: Option, ) -> PyResult> { let inner = self_.get_inner()?.clone(); @@ -93,6 +94,10 @@ impl Connection { builder = builder.storage_options(storage_options); } + if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths { + builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths); + } + if let Some(data_storage_version) = data_storage_version.as_ref() { builder = builder.data_storage_version( LanceFileVersion::from_str(data_storage_version) @@ -113,6 +118,7 @@ impl Connection { schema: Bound<'_, PyAny>, storage_options: Option>, data_storage_version: Option, + enable_v2_manifest_paths: Option, ) -> PyResult> { let inner = self_.get_inner()?.clone(); @@ -126,6 +132,10 @@ impl Connection { builder = builder.storage_options(storage_options); } + if let Some(enable_v2_manifest_paths) = enable_v2_manifest_paths { + builder = builder.enable_v2_manifest_paths(enable_v2_manifest_paths); + } + if let Some(data_storage_version) = data_storage_version.as_ref() { builder = builder.data_storage_version( LanceFileVersion::from_str(data_storage_version) diff --git a/python/src/table.rs b/python/src/table.rs index 346c14d2..ee949a4b 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -303,4 +303,28 @@ impl Table { }) }) } + + pub fn uses_v2_manifest_paths(self_: PyRef<'_, Self>) -> PyResult> { + let inner = self_.inner_ref()?.clone(); + future_into_py(self_.py(), async move { + inner + .as_native() + .ok_or_else(|| PyValueError::new_err("This cannot be run on a remote table"))? + .uses_v2_manifest_paths() + .await + .infer_error() + }) + } + + pub fn migrate_manifest_paths_v2(self_: PyRef<'_, Self>) -> PyResult> { + let inner = self_.inner_ref()?.clone(); + future_into_py(self_.py(), async move { + inner + .as_native() + .ok_or_else(|| PyValueError::new_err("This cannot be run on a remote table"))? + .migrate_manifest_paths_v2() + .await + .infer_error() + }) + } } diff --git a/rust/lancedb/Cargo.toml b/rust/lancedb/Cargo.toml index c547b882..818e62cf 100644 --- a/rust/lancedb/Cargo.toml +++ b/rust/lancedb/Cargo.toml @@ -27,6 +27,7 @@ lazy_static.workspace = true lance = { workspace = true } lance-datafusion.workspace = true lance-index = { workspace = true } +lance-table = { workspace = true } lance-linalg = { workspace = true } lance-testing = { workspace = true } lance-encoding = { workspace = true } diff --git a/rust/lancedb/src/connection.rs b/rust/lancedb/src/connection.rs index 57763d23..e46899a4 100644 --- a/rust/lancedb/src/connection.rs +++ b/rust/lancedb/src/connection.rs @@ -142,6 +142,7 @@ pub struct CreateTableBuilder { pub(crate) table_definition: Option, pub(crate) embeddings: Vec<(EmbeddingDefinition, Arc)>, pub(crate) data_storage_version: Option, + pub(crate) enable_v2_manifest_paths: Option, } // Builder methods that only apply when we have initial data @@ -156,6 +157,7 @@ impl CreateTableBuilder { table_definition: None, embeddings: Vec::new(), data_storage_version: None, + enable_v2_manifest_paths: None, } } @@ -188,6 +190,7 @@ impl CreateTableBuilder { write_options: self.write_options, embeddings: self.embeddings, data_storage_version: self.data_storage_version, + enable_v2_manifest_paths: self.enable_v2_manifest_paths, }; Ok((data, builder)) } @@ -222,6 +225,7 @@ impl CreateTableBuilder { write_options: WriteOptions::default(), embeddings: Vec::new(), data_storage_version: None, + enable_v2_manifest_paths: None, } } @@ -284,6 +288,23 @@ impl CreateTableBuilder { self } + /// Set whether to use V2 manifest paths for the table. (default: false) + /// + /// These paths provide more efficient opening of tables with many + /// versions on object stores. + /// + ///
Turning this on will make the dataset unreadable + /// for older versions of LanceDB (prior to 0.10.0).
+ /// + /// To migrate an existing dataset, instead use the + /// [[NativeTable::migrate_manifest_paths_v2]]. + /// + /// This has no effect in LanceDB Cloud. + pub fn enable_v2_manifest_paths(mut self, use_v2_manifest_paths: bool) -> Self { + self.enable_v2_manifest_paths = Some(use_v2_manifest_paths); + self + } + /// Set the data storage version. /// /// The default is `LanceFileVersion::Legacy`. @@ -976,7 +997,10 @@ impl ConnectionInternal for Database { if matches!(&options.mode, CreateTableMode::Overwrite) { write_params.mode = WriteMode::Overwrite; } + write_params.data_storage_version = options.data_storage_version; + write_params.enable_v2_manifest_paths = + options.enable_v2_manifest_paths.unwrap_or_default(); match NativeTable::create( &table_uri, diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 88c23533..3e66d2c7 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -43,6 +43,7 @@ use lance_index::vector::pq::PQBuildParams; use lance_index::vector::sq::builder::SQBuildParams; use lance_index::DatasetIndexExt; use lance_index::IndexType; +use lance_table::io::commit::ManifestNamingScheme; use log::info; use serde::{Deserialize, Serialize}; use snafu::whatever; @@ -1646,6 +1647,35 @@ impl NativeTable { Default::default(), )?)) } + + /// Check whether the table uses V2 manifest paths. + /// + /// See [Self::migrate_manifest_paths_v2] and [ManifestNamingScheme] for + /// more information. + pub async fn uses_v2_manifest_paths(&self) -> Result { + let dataset = self.dataset.get().await?; + Ok(dataset.manifest_naming_scheme == ManifestNamingScheme::V2) + } + + /// Migrate the table to use the new manifest path scheme. + /// + /// This function will rename all V1 manifests to V2 manifest paths. + /// These paths provide more efficient opening of datasets with many versions + /// on object stores. + /// + /// This function is idempotent, and can be run multiple times without + /// changing the state of the object store. + /// + /// However, it should not be run while other concurrent operations are happening. + /// And it should also run until completion before resuming other operations. + /// + /// You can use [Self::uses_v2_manifest_paths] to check if the table is already + /// using V2 manifest paths. + pub async fn migrate_manifest_paths_v2(&self) -> Result<()> { + let mut dataset = self.dataset.get_mut().await?; + dataset.migrate_manifest_paths_v2().await?; + Ok(()) + } } #[async_trait::async_trait]