feat!: refactor ConnectionInternal into a Database trait (#2067)

This opens up the door for more custom database implementations than the
two we have today. The biggest change should be invisible:
`ConnectionInternal` has been renamed to `Database`, made public, and
refactored.

However, there are a few breaking changes. `data_storage_version` and
`enable_v2_manifest_paths` have been moved from options on
`create_table` to options for the database which are now set via
`storage_options`.

Before:
```
db = connect(uri)
tbl = db.create_table("my_table", data, data_storage_version="legacy", enable_v2_manifest_paths=True)
```

After:
```
db = connect(uri, storage_options={
  "new_table_enable_v2_manifest_paths": "true",
  "new_table_data_storage_version": "legacy"
})
tbl = db.create_table("my_table", data)
```

BREAKING CHANGE: the data_storage_version and enable_v2_manifest_paths
options have moved from create_table arguments to storage_options keys
(new_table_data_storage_version and new_table_enable_v2_manifest_paths).
BREAKING CHANGE: the use_legacy_format option has been removed;
data_storage_version has replaced it for some time now.
This commit is contained in:
Weston Pace
2025-02-04 14:35:14 -08:00
committed by GitHub
parent f6eef14313
commit c269524b2f
20 changed files with 1131 additions and 876 deletions

View File

@@ -299,12 +299,12 @@ def test_create_exist_ok(tmp_db: lancedb.DBConnection):
@pytest.mark.asyncio
async def test_connect(tmp_path):
db = await lancedb.connect_async(tmp_path)
assert str(db) == f"NativeDatabase(uri={tmp_path}, read_consistency_interval=None)"
assert str(db) == f"ListingDatabase(uri={tmp_path}, read_consistency_interval=None)"
db = await lancedb.connect_async(
tmp_path, read_consistency_interval=timedelta(seconds=5)
)
assert str(db) == f"NativeDatabase(uri={tmp_path}, read_consistency_interval=5s)"
assert str(db) == f"ListingDatabase(uri={tmp_path}, read_consistency_interval=5s)"
@pytest.mark.asyncio
@@ -396,13 +396,16 @@ async def test_create_exist_ok_async(tmp_db_async: lancedb.AsyncConnection):
@pytest.mark.asyncio
async def test_create_table_v2_manifest_paths_async(tmp_path):
db = await lancedb.connect_async(tmp_path)
db_with_v2_paths = await lancedb.connect_async(
tmp_path, storage_options={"new_table_enable_v2_manifest_paths": "true"}
)
db_no_v2_paths = await lancedb.connect_async(
tmp_path, storage_options={"new_table_enable_v2_manifest_paths": "false"}
)
# Create table in v2 mode with v2 manifest paths enabled
tbl = await db.create_table(
tbl = await db_with_v2_paths.create_table(
"test_v2_manifest_paths",
data=[{"id": 0}],
use_legacy_format=False,
enable_v2_manifest_paths=True,
)
assert await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions"
@@ -410,11 +413,9 @@ async def test_create_table_v2_manifest_paths_async(tmp_path):
assert re.match(r"\d{20}\.manifest", manifest)
# Start a table in V1 mode then migrate
tbl = await db.create_table(
tbl = await db_no_v2_paths.create_table(
"test_v2_migration",
data=[{"id": 0}],
use_legacy_format=False,
enable_v2_manifest_paths=False,
)
assert not await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions"
@@ -583,7 +584,7 @@ def test_empty_or_nonexistent_table(mem_db: lancedb.DBConnection):
@pytest.mark.asyncio
async def test_create_in_v2_mode(mem_db_async: lancedb.AsyncConnection):
async def test_create_in_v2_mode():
def make_data():
for i in range(10):
yield pa.record_batch([pa.array([x for x in range(1024)])], names=["x"])
@@ -594,10 +595,13 @@ async def test_create_in_v2_mode(mem_db_async: lancedb.AsyncConnection):
schema = pa.schema([pa.field("x", pa.int64())])
# Create table in v1 mode
tbl = await mem_db_async.create_table(
"test", data=make_data(), schema=schema, data_storage_version="legacy"
v1_db = await lancedb.connect_async(
"memory://", storage_options={"new_table_data_storage_version": "legacy"}
)
tbl = await v1_db.create_table("test", data=make_data(), schema=schema)
async def is_in_v2_mode(tbl):
batches = (
await tbl.query().limit(10 * 1024).to_batches(max_batch_length=1024 * 10)
@@ -610,10 +614,12 @@ async def test_create_in_v2_mode(mem_db_async: lancedb.AsyncConnection):
assert not await is_in_v2_mode(tbl)
# Create table in v2 mode
tbl = await mem_db_async.create_table(
"test_v2", data=make_data(), schema=schema, use_legacy_format=False
v2_db = await lancedb.connect_async(
"memory://", storage_options={"new_table_data_storage_version": "stable"}
)
tbl = await v2_db.create_table("test_v2", data=make_data(), schema=schema)
assert await is_in_v2_mode(tbl)
# Add data (should remain in v2 mode)
@@ -622,20 +628,18 @@ async def test_create_in_v2_mode(mem_db_async: lancedb.AsyncConnection):
assert await is_in_v2_mode(tbl)
# Create empty table in v2 mode and add data
tbl = await mem_db_async.create_table(
"test_empty_v2", data=None, schema=schema, use_legacy_format=False
)
tbl = await v2_db.create_table("test_empty_v2", data=None, schema=schema)
await tbl.add(make_table())
assert await is_in_v2_mode(tbl)
# Create empty table uses v1 mode by default
tbl = await mem_db_async.create_table(
"test_empty_v2_default", data=None, schema=schema, data_storage_version="legacy"
)
# Db uses v2 mode by default
db = await lancedb.connect_async("memory://")
tbl = await db.create_table("test_empty_v2_default", data=None, schema=schema)
await tbl.add(make_table())
assert not await is_in_v2_mode(tbl)
assert await is_in_v2_mode(tbl)
def test_replace_index(mem_db: lancedb.DBConnection):