feat: add flag to enable faster manifest paths (#1612)

The new V2 manifest path scheme makes discovering the latest version of
a table constant time on object stores, regardless of the number of
versions in the table. See benchmarks in the PR here:
https://github.com/lancedb/lance/pull/2798

Closes #1583
This commit is contained in:
Will Jones
2024-09-09 11:34:36 -07:00
committed by GitHub
parent 029b01bbbf
commit 2a6586d6fb
16 changed files with 292 additions and 2 deletions

View File

@@ -13,6 +13,7 @@
import re
from datetime import timedelta
import os
import lancedb
import numpy as np
@@ -413,6 +414,40 @@ async def test_create_exist_ok_async(tmp_path):
# await db.create_table("test", schema=bad_schema, exist_ok=True)
def _assert_manifest_names(manifests_dir, pattern):
    """Assert every entry in ``manifests_dir`` is named exactly per ``pattern``.

    Args:
        manifests_dir: Path of the table's ``_versions`` directory.
        pattern: Regular expression the whole file name must match.

    Raises:
        AssertionError: If the directory is empty or any name does not match.
    """
    manifests = os.listdir(manifests_dir)
    # Without this guard an empty directory would let the loop below pass
    # without checking anything.
    assert manifests, f"no manifest files found in {manifests_dir}"
    for manifest in manifests:
        # fullmatch (not match) so a name that merely *starts* with a valid
        # manifest name, e.g. one with a trailing suffix, is rejected.
        assert re.fullmatch(pattern, manifest), manifest


@pytest.mark.asyncio
async def test_create_table_v2_manifest_paths_async(tmp_path):
    """Check manifest file naming with V2 manifest paths on, off, and migrated.

    Covers two scenarios against a local database rooted at ``tmp_path``:

    1. A table created with ``enable_v2_manifest_paths=True`` reports that it
       uses V2 paths and its manifests are named with 20 digits.
    2. A table created with ``enable_v2_manifest_paths=False`` starts with
       single-digit manifest names, and after ``migrate_manifest_paths_v2()``
       reports V2 paths and has 20-digit manifest names.
    """
    db = await lancedb.connect_async(tmp_path)

    # Create table in v2 mode with v2 manifest paths enabled
    tbl = await db.create_table(
        "test_v2_manifest_paths",
        data=[{"id": 0}],
        use_legacy_format=False,
        enable_v2_manifest_paths=True,
    )
    assert await tbl.uses_v2_manifest_paths()
    manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions"
    _assert_manifest_names(manifests_dir, r"\d{20}\.manifest")

    # Start a table in V1 mode then migrate
    tbl = await db.create_table(
        "test_v2_migration",
        data=[{"id": 0}],
        use_legacy_format=False,
        enable_v2_manifest_paths=False,
    )
    assert not await tbl.uses_v2_manifest_paths()
    manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions"
    _assert_manifest_names(manifests_dir, r"\d\.manifest")

    await tbl.migrate_manifest_paths_v2()
    assert await tbl.uses_v2_manifest_paths()
    # Same directory as before: the migration is expected to have renamed
    # every manifest to the 20-digit form.
    _assert_manifest_names(manifests_dir, r"\d{20}\.manifest")
def test_open_table_sync(tmp_path):
db = lancedb.connect(tmp_path)
db.create_table("test", data=[{"id": 0}])