feat: add flag to enable faster manifest paths (#1612)

The new V2 manifest path scheme makes discovering the latest version of
a table constant time on object stores, regardless of the number of
versions in the table. See benchmarks in the PR here:
https://github.com/lancedb/lance/pull/2798

Closes #1583
This commit is contained in:
Will Jones
2024-09-09 11:34:36 -07:00
committed by GitHub
parent 029b01bbbf
commit 2a6586d6fb
16 changed files with 292 additions and 2 deletions

View File

@@ -25,6 +25,7 @@ class Connection(object):
data: pa.RecordBatchReader,
storage_options: Optional[Dict[str, str]] = None,
data_storage_version: Optional[str] = None,
enable_v2_manifest_paths: Optional[bool] = None,
) -> Table: ...
async def create_empty_table(
self,
@@ -33,6 +34,7 @@ class Connection(object):
schema: pa.Schema,
storage_options: Optional[Dict[str, str]] = None,
data_storage_version: Optional[str] = None,
enable_v2_manifest_paths: Optional[bool] = None,
) -> Table: ...
class Table:

View File

@@ -567,6 +567,7 @@ class AsyncConnection(object):
*,
data_storage_version: Optional[str] = None,
use_legacy_format: Optional[bool] = None,
enable_v2_manifest_paths: Optional[bool] = None,
) -> AsyncTable:
"""Create an [AsyncTable][lancedb.table.AsyncTable] in the database.
@@ -618,6 +619,14 @@ class AsyncConnection(object):
If True, use the legacy format for the table. If False, use the new format.
The default is True while the new format is in beta.
This method is deprecated, use `data_storage_version` instead.
enable_v2_manifest_paths: bool, optional, default False
Use the new V2 manifest paths. These paths provide more efficient
opening of datasets with many versions on object stores. WARNING:
turning this on will make the dataset unreadable for older versions
of LanceDB (prior to 0.13.0). To migrate an existing dataset, instead
use the
[AsyncTable.migrate_manifest_paths_v2][lancedb.table.AsyncTable.migrate_manifest_paths_v2]
method.
Returns
@@ -761,6 +770,7 @@ class AsyncConnection(object):
schema,
storage_options=storage_options,
data_storage_version=data_storage_version,
enable_v2_manifest_paths=enable_v2_manifest_paths,
)
else:
data = data_to_reader(data, schema)
@@ -770,6 +780,7 @@ class AsyncConnection(object):
data,
storage_options=storage_options,
data_storage_version=data_storage_version,
enable_v2_manifest_paths=enable_v2_manifest_paths,
)
return AsyncTable(new_table)

View File

@@ -2539,3 +2539,34 @@ class AsyncTable:
List all indices that have been created with Self::create_index
"""
return await self._inner.list_indices()
async def uses_v2_manifest_paths(self) -> bool:
"""
Check if the table is using the new v2 manifest paths.
Returns
-------
bool
True if the table is using the new v2 manifest paths, False otherwise.
"""
return await self._inner.uses_v2_manifest_paths()
async def migrate_manifest_paths_v2(self):
"""
Migrate the manifest paths to the new format.
This will update the manifest to use the new v2 format for paths.
This function is idempotent, and can be run multiple times without
changing the state of the object store.
!!! danger
This should not be run while other concurrent operations are happening.
And it should also run until completion before resuming other operations.
You can use
[AsyncTable.uses_v2_manifest_paths][lancedb.table.AsyncTable.uses_v2_manifest_paths]
to check if the table is already using the new path style.
"""
await self._inner.migrate_manifest_paths_v2()

View File

@@ -13,6 +13,7 @@
import re
from datetime import timedelta
import os
import lancedb
import numpy as np
@@ -413,6 +414,40 @@ async def test_create_exist_ok_async(tmp_path):
# await db.create_table("test", schema=bad_schema, exist_ok=True)
@pytest.mark.asyncio
async def test_create_table_v2_manifest_paths_async(tmp_path):
db = await lancedb.connect_async(tmp_path)
# Create table in v2 mode with v2 manifest paths enabled
tbl = await db.create_table(
"test_v2_manifest_paths",
data=[{"id": 0}],
use_legacy_format=False,
enable_v2_manifest_paths=True,
)
assert await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions"
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)
# Start a table in V1 mode then migrate
tbl = await db.create_table(
"test_v2_migration",
data=[{"id": 0}],
use_legacy_format=False,
enable_v2_manifest_paths=False,
)
assert not await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions"
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d\.manifest", manifest)
await tbl.migrate_manifest_paths_v2()
assert await tbl.uses_v2_manifest_paths()
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)
def test_open_table_sync(tmp_path):
db = lancedb.connect(tmp_path)
db.create_table("test", data=[{"id": 0}])