diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index bf069d2c..18f6f90b 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -2451,7 +2451,10 @@ class AsyncTable: await self._inner.restore() async def optimize( - self, *, cleanup_older_than: Optional[timedelta] = None + self, + *, + cleanup_older_than: Optional[timedelta] = None, + delete_unverified: bool = False, ) -> OptimizeStats: """ Optimize the on-disk data and indices for better performance. @@ -2470,6 +2473,11 @@ class AsyncTable: All files belonging to versions older than this will be removed. Set to 0 days to remove all versions except the latest. The latest version is never removed. + delete_unverified: bool, default False + Files leftover from a failed transaction may appear to be part of an + in-progress operation (e.g. appending new data) and these files will not + be deleted unless they are at least 7 days old. If delete_unverified is True + then these files will be deleted regardless of their age. Experimental API ---------------- @@ -2491,7 +2499,7 @@ class AsyncTable: """ if cleanup_older_than is not None: cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000) - return await self._inner.optimize(cleanup_older_than) + return await self._inner.optimize(cleanup_older_than, delete_unverified) async def list_indices(self) -> IndexConfig: """ diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 832fa76e..0d6beeb4 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -8,6 +8,7 @@ from pathlib import Path from time import sleep from typing import List from unittest.mock import PropertyMock, patch +import os import lance import lancedb @@ -1052,3 +1053,25 @@ async def test_optimize(db_async: AsyncConnection): assert stats.prune.old_versions_removed == 3 assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]}) + + +@pytest.mark.asyncio +async def test_optimize_delete_unverified(db_async: AsyncConnection, tmp_path): + table = await db_async.create_table( + "test", + data=[{"x": [1]}], + ) + await table.add( + data=[ + {"x": [2]}, + ], + ) + version = await table.version() + path = tmp_path / "test.lance" / "_versions" / f"{version - 1}.manifest" + os.remove(path) + stats = await table.optimize(delete_unverified=False) + assert stats.prune.old_versions_removed == 0 + stats = await table.optimize( + cleanup_older_than=timedelta(seconds=0), delete_unverified=True + ) + assert stats.prune.old_versions_removed == 2 diff --git a/python/src/table.rs b/python/src/table.rs index 497b0ca2..346c14d2 100644 --- a/python/src/table.rs +++ b/python/src/table.rs @@ -248,6 +248,7 @@ impl Table { pub fn optimize( self_: PyRef<'_, Self>, cleanup_since_ms: Option, + delete_unverified: Option, ) -> PyResult> { let inner = self_.inner_ref()?.clone(); let older_than = if let Some(ms) = cleanup_since_ms { @@ -275,7 +276,7 @@ impl Table { let prune_stats = inner .optimize(OptimizeAction::Prune { older_than, - delete_unverified: None, + delete_unverified, error_if_tagged_old_versions: None, }) .await