feat(python): add delete unverified parameter (#1542)

PR fixes #1527
This commit is contained in:
Gagan Bhullar
2024-08-15 10:01:32 -06:00
committed by GitHub
parent b624fc59eb
commit 20faa4424b
3 changed files with 35 additions and 3 deletions

View File

@@ -2451,7 +2451,10 @@ class AsyncTable:
await self._inner.restore() await self._inner.restore()
async def optimize( async def optimize(
self, *, cleanup_older_than: Optional[timedelta] = None self,
*,
cleanup_older_than: Optional[timedelta] = None,
delete_unverified: bool = False,
) -> OptimizeStats: ) -> OptimizeStats:
""" """
Optimize the on-disk data and indices for better performance. Optimize the on-disk data and indices for better performance.
@@ -2470,6 +2473,11 @@ class AsyncTable:
All files belonging to versions older than this will be removed. Set All files belonging to versions older than this will be removed. Set
to 0 days to remove all versions except the latest. The latest version to 0 days to remove all versions except the latest. The latest version
is never removed. is never removed.
delete_unverified: bool, default False
Files leftover from a failed transaction may appear to be part of an
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
Experimental API Experimental API
---------------- ----------------
@@ -2491,7 +2499,7 @@ class AsyncTable:
""" """
if cleanup_older_than is not None: if cleanup_older_than is not None:
cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000) cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000)
return await self._inner.optimize(cleanup_older_than) return await self._inner.optimize(cleanup_older_than, delete_unverified)
async def list_indices(self) -> IndexConfig: async def list_indices(self) -> IndexConfig:
""" """

View File

@@ -8,6 +8,7 @@ from pathlib import Path
from time import sleep from time import sleep
from typing import List from typing import List
from unittest.mock import PropertyMock, patch from unittest.mock import PropertyMock, patch
import os
import lance import lance
import lancedb import lancedb
@@ -1052,3 +1053,25 @@ async def test_optimize(db_async: AsyncConnection):
assert stats.prune.old_versions_removed == 3 assert stats.prune.old_versions_removed == 3
assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]}) assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]})
@pytest.mark.asyncio
async def test_optimize_delete_unverified(db_async: AsyncConnection, tmp_path):
table = await db_async.create_table(
"test",
data=[{"x": [1]}],
)
await table.add(
data=[
{"x": [2]},
],
)
version = await table.version()
path = tmp_path / "test.lance" / "_versions" / f"{version - 1}.manifest"
os.remove(path)
stats = await table.optimize(delete_unverified=False)
assert stats.prune.old_versions_removed == 0
stats = await table.optimize(
cleanup_older_than=timedelta(seconds=0), delete_unverified=True
)
assert stats.prune.old_versions_removed == 2

View File

@@ -248,6 +248,7 @@ impl Table {
pub fn optimize( pub fn optimize(
self_: PyRef<'_, Self>, self_: PyRef<'_, Self>,
cleanup_since_ms: Option<u64>, cleanup_since_ms: Option<u64>,
delete_unverified: Option<bool>,
) -> PyResult<Bound<'_, PyAny>> { ) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone(); let inner = self_.inner_ref()?.clone();
let older_than = if let Some(ms) = cleanup_since_ms { let older_than = if let Some(ms) = cleanup_since_ms {
@@ -275,7 +276,7 @@ impl Table {
let prune_stats = inner let prune_stats = inner
.optimize(OptimizeAction::Prune { .optimize(OptimizeAction::Prune {
older_than, older_than,
delete_unverified: None, delete_unverified,
error_if_tagged_old_versions: None, error_if_tagged_old_versions: None,
}) })
.await .await