feat: cleanup and compaction (#518)

#488
This commit is contained in:
Will Jones
2023-10-11 12:49:12 -07:00
committed by Weston Pace
parent 541b06664f
commit c07207c661
12 changed files with 394 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ from __future__ import annotations
import inspect
import os
from abc import ABC, abstractmethod
from datetime import timedelta
from functools import cached_property
from typing import Any, Iterable, List, Optional, Union
@@ -24,7 +25,7 @@ import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
from lance import LanceDataset
from lance.dataset import ReaderLike
from lance.dataset import CleanupStats, ReaderLike
from lance.vector import vec_to_table
from .common import DATA, VEC, VECTOR_COLUMN_NAME
@@ -394,6 +395,17 @@ class LanceTable(Table):
raise ValueError(f"Invalid version {version}")
self._reset_dataset(version=version)
try:
# Accessing the property updates the cached value
_ = self._dataset
except Exception as e:
if "not found" in str(e):
raise ValueError(
f"Version {version} no longer exists. Was it cleaned up?"
)
else:
raise e
def restore(self, version: int = None):
"""Restore a version of the table. This is an in-place operation.
@@ -870,6 +882,48 @@ class LanceTable(Table):
},
)
def cleanup_old_versions(
self,
older_than: Optional[timedelta] = None,
*,
delete_unverified: bool = False,
) -> CleanupStats:
"""
Clean up old versions of the table, freeing disk space.
Parameters
----------
older_than: timedelta, default None
The minimum age of the version to delete. If None, then this defaults
to two weeks.
delete_unverified: bool, default False
Because they may be part of an in-progress transaction, files newer
than 7 days old are not deleted by default. If you are sure that
there are no in-progress transactions, then you can set this to True
to delete all files older than `older_than`.
Returns
-------
CleanupStats
The stats of the cleanup operation, including how many bytes were
freed.
"""
return self.to_lance().cleanup_old_versions(
older_than, delete_unverified=delete_unverified
)
def compact_files(self, *args, **kwargs):
"""
Run the compaction process on the table.
This can be run after making several small appends to optimize the table
for faster reads.
Arguments are passed onto :meth:`lance.dataset.DatasetOptimizer.compact_files`.
For most cases, the default should be fine.
"""
return self.to_lance().optimize.compact_files(*args, **kwargs)
def _sanitize_schema(
data: pa.Table,

View File

@@ -12,6 +12,7 @@
# limitations under the License.
import functools
from datetime import timedelta
from pathlib import Path
from typing import List
from unittest.mock import PropertyMock, patch
@@ -442,3 +443,31 @@ def test_empty_query(db):
df = table.search().select(["id"]).where("text='bar'").limit(1).to_pandas()
val = df.id.iloc[0]
assert val == 1
def test_compact_cleanup(db):
table = LanceTable.create(
db,
"my_table",
data=[{"text": "foo", "id": 0}, {"text": "bar", "id": 1}],
)
table.add([{"text": "baz", "id": 2}])
assert len(table) == 3
assert table.version == 3
stats = table.compact_files()
assert len(table) == 3
assert table.version == 4
assert stats.fragments_removed > 0
assert stats.fragments_added == 1
stats = table.cleanup_old_versions()
assert stats.bytes_removed == 0
stats = table.cleanup_old_versions(older_than=timedelta(0), delete_unverified=True)
assert stats.bytes_removed > 0
assert table.version == 4
with pytest.raises(Exception, match="Version 3 no longer exists"):
table.checkout(3)