feat: upgrade to lance v0.25.0-beta.5 (#2248)

- adds `loss` to the index stats for vector indices - `optimize` can now retrain vector indices --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-01-12 23:02:59 +00:00 · 2025-03-22 01:12:23 +08:00
parent ba1ded933a
commit 7ff6ec7fe3
11 changed files with 89 additions and 25 deletions
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1185,6 +1185,7 @@ class Table(ABC):
        *,
        cleanup_older_than: Optional[timedelta] = None,
        delete_unverified: bool = False,
+        retrain: bool = False,
    ):
        """
        Optimize the on-disk data and indices for better performance.
@@ -1208,6 +1209,11 @@ class Table(ABC):
            in-progress operation (e.g. appending new data) and these files will not
            be deleted unless they are at least 7 days old. If delete_unverified is True
            then these files will be deleted regardless of their age.
+        retrain: bool, default False
+            If True, retrain the vector indices. This refines the IVF clustering
+            and quantization, which may improve search accuracy. It is faster than
+            re-creating the index from scratch, so it is recommended to try this
+            first when the data distribution has changed significantly.

        Experimental API
        ----------------
@@ -2342,6 +2348,7 @@ class LanceTable(Table):
        *,
        cleanup_older_than: Optional[timedelta] = None,
        delete_unverified: bool = False,
+        retrain: bool = False,
    ):
        """
        Optimize the on-disk data and indices for better performance.
@@ -2365,6 +2372,11 @@ class LanceTable(Table):
            in-progress operation (e.g. appending new data) and these files will not
            be deleted unless they are at least 7 days old. If delete_unverified is True
            then these files will be deleted regardless of their age.
+        retrain: bool, default False
+            If True, retrain the vector indices. This refines the IVF clustering
+            and quantization, which may improve search accuracy. It is faster than
+            re-creating the index from scratch, so it is recommended to try this
+            first when the data distribution has changed significantly.

        Experimental API
        ----------------
@@ -2388,6 +2400,7 @@ class LanceTable(Table):
            self._table.optimize(
                cleanup_older_than=cleanup_older_than,
                delete_unverified=delete_unverified,
+                retrain=retrain,
            )
        )

@@ -3590,6 +3603,7 @@ class AsyncTable:
        *,
        cleanup_older_than: Optional[timedelta] = None,
        delete_unverified: bool = False,
+        retrain=False,
    ) -> OptimizeStats:
        """
        Optimize the on-disk data and indices for better performance.
@@ -3613,6 +3627,11 @@ class AsyncTable:
            in-progress operation (e.g. appending new data) and these files will not
            be deleted unless they are at least 7 days old. If delete_unverified is True
            then these files will be deleted regardless of their age.
+        retrain: bool, default False
+            If True, retrain the vector indices. This refines the IVF clustering
+            and quantization, which may improve search accuracy. It is faster than
+            re-creating the index from scratch, so it is recommended to try this
+            first when the data distribution has changed significantly.

        Experimental API
        ----------------
@@ -3636,7 +3655,9 @@ class AsyncTable:
        if cleanup_older_than is not None:
            cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
        return await self._inner.optimize(
-            cleanup_since_ms=cleanup_since_ms, delete_unverified=delete_unverified
+            cleanup_since_ms=cleanup_since_ms,
+            delete_unverified=delete_unverified,
+            retrain=retrain,
        )

    async def list_indices(self) -> Iterable[IndexConfig]:
@@ -3729,6 +3750,8 @@ class IndexStatistics:
        The distance type used by the index.
    num_indices: Optional[int]
        The number of parts the index is split into.
+    loss: Optional[float]
+        The KMeans loss for the index; only available for vector indices.
    """

    num_indexed_rows: int
@@ -3738,6 +3761,7 @@ class IndexStatistics:
    ]
    distance_type: Optional[Literal["l2", "cosine", "dot"]] = None
    num_indices: Optional[int] = None
+    loss: Optional[float] = None

    # This exists for backwards compatibility with an older API, which returned
    # a dictionary instead of a class.
--- a/python/python/tests/test_index.py
+++ b/python/python/tests/test_index.py
@@ -131,6 +131,7 @@ async def test_create_vector_index(some_table: AsyncTable):
    assert stats.num_indexed_rows == await some_table.count_rows()
    assert stats.num_unindexed_rows == 0
    assert stats.num_indices == 1
+    assert stats.loss >= 0.0


@pytest.mark.asyncio
@@ -154,6 +155,7 @@ async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
    assert stats.num_indexed_rows == await some_table.count_rows()
    assert stats.num_unindexed_rows == 0
    assert stats.num_indices == 1
+    assert stats.loss >= 0.0


@pytest.mark.asyncio
--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -235,6 +235,10 @@ impl Table {
                        dict.set_item("num_indices", num_indices)?;
                    }

+                    if let Some(loss) = stats.loss {
+                        dict.set_item("loss", loss)?;
+                    }
+
                    Ok(Some(dict.unbind()))
                })
            } else {
@@ -312,11 +316,12 @@ impl Table {
    }

    /// Optimize the on-disk data by compacting and pruning old data, for better performance.
-    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
+    #[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
    pub fn optimize(
        self_: PyRef<'_, Self>,
        cleanup_since_ms: Option<u64>,
        delete_unverified: Option<bool>,
+        retrain: Option<bool>,
    ) -> PyResult<Bound<'_, PyAny>> {
        let inner = self_.inner_ref()?.clone();
        let older_than = if let Some(ms) = cleanup_since_ms {
@@ -352,9 +357,10 @@ impl Table {
                .prune
                .unwrap();
            inner
-                .optimize(lancedb::table::OptimizeAction::Index(
-                    OptimizeOptions::default(),
-                ))
+                .optimize(lancedb::table::OptimizeAction::Index(match retrain {
+                    Some(true) => OptimizeOptions::retrain(),
+                    _ => OptimizeOptions::default(),
+                }))
                .await
                .infer_error()?;
            Ok(OptimizeStats {