feat: add time travel operations to the async API (#1070)

This commit is contained in:
Weston Pace
2024-03-12 09:20:23 -07:00
parent f822255683
commit 47daf9b7b0
13 changed files with 472 additions and 87 deletions

View File

@@ -34,6 +34,10 @@ class Table:
async def create_index(
self, column: str, config: Optional[Index], replace: Optional[bool]
): ...
async def version(self) -> int: ...
async def checkout(self, version): ...
async def checkout_latest(self): ...
async def restore(self): ...
async def connect(
uri: str,

View File

@@ -529,7 +529,7 @@ class AsyncConnection(object):
on_bad_vectors: Optional[str] = None,
fill_value: Optional[float] = None,
embedding_functions: Optional[List[EmbeddingFunctionConfig]] = None,
) -> Table:
) -> AsyncTable:
"""Create a [Table][lancedb.table.Table] in the database.
Parameters

View File

@@ -2354,3 +2354,59 @@ class AsyncTable:
The names of the columns to drop.
"""
raise NotImplementedError
async def version(self) -> int:
"""
Retrieve the version of the table
LanceDb supports versioning. Every operation that modifies the table increases
version. As long as a version hasn't been deleted you can `[Self::checkout]`
that version to view the data at that point. In addition, you can
`[Self::restore]` the version to replace the current table with a previous
version.
"""
return await self._inner.version()
async def checkout(self, version):
"""
Checks out a specific version of the Table
Any read operation on the table will now access the data at the checked out
version. As a consequence, calling this method will disable any read consistency
interval that was previously set.
This is a read-only operation that turns the table into a sort of "view"
or "detached head". Other table instances will not be affected. To make the
change permanent you can use the `[Self::restore]` method.
Any operation that modifies the table will fail while the table is in a checked
out state.
To return the table to a normal state use `[Self::checkout_latest]`
"""
await self._inner.checkout(version)
async def checkout_latest(self):
"""
Ensures the table is pointing at the latest version
This can be used to manually update a table when the read_consistency_interval
is None
It can also be used to undo a `[Self::checkout]` operation
"""
await self._inner.checkout_latest()
async def restore(self):
"""
Restore the table to the currently checked out version
This operation will fail if checkout has not been called previously
This operation will overwrite the latest version of the table with a
previous version. Any changes made since the checked out version will
no longer be visible.
Once the operation concludes the table will no longer be in a checked
out state and the read_consistency_interval, if any, will apply.
"""
await self._inner.restore()

View File

@@ -974,3 +974,37 @@ def test_drop_columns(tmp_path):
table = LanceTable.create(db, "my_table", data=data)
table.drop_columns(["category"])
assert table.to_arrow().column_names == ["id"]
@pytest.mark.asyncio
async def test_time_travel(db_async: AsyncConnection):
# Setup
table = await db_async.create_table("some_table", data=[{"id": 0}])
version = await table.version()
await table.add([{"id": 1}])
assert await table.count_rows() == 2
# Make sure we can rewind
await table.checkout(version)
assert await table.count_rows() == 1
# Can't add data in time travel mode
with pytest.raises(
ValueError,
match="table cannot be modified when a specific version is checked out",
):
await table.add([{"id": 2}])
# Can go back to normal mode
await table.checkout_latest()
assert await table.count_rows() == 2
# Should be able to add data again
await table.add([{"id": 3}])
assert await table.count_rows() == 3
# Now checkout and restore
await table.checkout(version)
await table.restore()
assert await table.count_rows() == 1
# Should be able to add data
await table.add([{"id": 4}])
assert await table.count_rows() == 2
# Can't use restore if not checked out
with pytest.raises(ValueError, match="checkout before running restore"):
await table.restore()