feat: add the optimize function to nodejs and async python (#1257)

The optimize function is pretty crucial for getting good performance when building a large-scale dataset, but it was only exposed in Rust (many sync Python users are probably doing this via `to_lance` today). This PR adds the optimize function to Node.js and to async Python. I left the function marked experimental because I think there will likely be changes to optimization (e.g. if we add features like "optimize on write"). I also only exposed the `cleanup_older_than` configuration parameter, since it is very commonly used; the rest have sensible defaults, and we don't yet know why we would recommend different values for them anyway.
2025-12-27 23:12:58 +00:00 · 2024-05-20 07:09:31 -07:00
parent 5349e8b1db
commit 4f512af024
9 changed files with 407 additions and 20 deletions
--- a/python/src/table.rs
+++ b/python/src/table.rs
@@ -2,7 +2,9 @@ use arrow::{
    ffi_stream::ArrowArrayStreamReader,
    pyarrow::{FromPyArrow, ToPyArrow},
 };
-use lancedb::table::{AddDataMode, Table as LanceDbTable};
+use lancedb::table::{
+    AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
+};
 use pyo3::{
    exceptions::{PyRuntimeError, PyValueError},
    pyclass, pymethods,
@@ -17,6 +19,40 @@ use crate::{
    query::Query,
 };

+/// Statistics about a compaction operation; `get_all` exposes every field to Python as a getter.
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct CompactionStats {
+    /// Number of fragments removed by the compaction.
+    pub fragments_removed: u64,
+    /// Number of new, compacted fragments added in their place.
+    pub fragments_added: u64,
+    /// Number of data files removed by the compaction.
+    pub files_removed: u64,
+    /// Number of new, compacted data files added in their place.
+    pub files_added: u64,
+}
+
+/// Statistics about a cleanup (prune) operation; `get_all` exposes every field to Python as a getter.
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct RemovalStats {
+    /// Number of bytes removed by the cleanup.
+    pub bytes_removed: u64,
+    /// Number of old table versions removed by the cleanup.
+    pub old_versions_removed: u64,
+}
+
+/// Combined statistics for one optimize call; `get_all` exposes every field to Python as a getter.
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct OptimizeStats {
+    /// Statistics reported by the compaction step.
+    pub compaction: CompactionStats,
+    /// Statistics reported by the prune step, which removes old versions.
+    pub prune: RemovalStats,
+}
+
 #[pyclass]
 pub struct Table {
    // We keep a copy of the name to use if the inner table is dropped
@@ -191,4 +227,47 @@ impl Table {
     /// Creates a `Query` wrapping the result of the inner table's `query()`.
     ///
     /// NOTE(review): the result of `inner_ref()` is unwrapped, so this panics
     /// instead of returning an error when `inner_ref()` fails — presumably
     /// when the inner table has been dropped; confirm against `inner_ref`.
     pub fn query(&self) -> Query {
         Query::new(self.inner_ref().unwrap().query())
     }
+
+    /// Optimizes the table and reports what was changed.
+    ///
+    /// Runs three optimize actions on the inner table, in order: `Compact`
+    /// (with default compaction options), `Prune`, and `Index` (with default
+    /// options). A failure in any step stops the sequence and is returned as
+    /// the error of the awaitable.
+    ///
+    /// `cleanup_since_ms` is converted to a millisecond `Duration` and handed
+    /// to the prune action as `older_than`; when it is `None`, no threshold is
+    /// passed. A value that does not fit in an `i64` is rejected with a
+    /// `ValueError` before any work is scheduled.
+    ///
+    /// Returns a Python awaitable resolving to an `OptimizeStats` that holds
+    /// the compaction and prune statistics; the result of the index step is
+    /// not included.
+    pub fn optimize(self_: PyRef<'_, Self>, cleanup_since_ms: Option<u64>) -> PyResult<&PyAny> {
+        let inner = self_.inner_ref()?.clone();
+        // Use a checked conversion: `since as i64` would silently wrap values
+        // above `i64::MAX` into a negative duration.
+        let older_than = cleanup_since_ms
+            .map(|since| {
+                i64::try_from(since)
+                    .map(Duration::milliseconds)
+                    .map_err(|_| {
+                        PyValueError::new_err(format!(
+                            "cleanup_since_ms is too large: {}",
+                            since
+                        ))
+                    })
+            })
+            .transpose()?;
+        future_into_py(self_.py(), async move {
+            let compaction_stats = inner
+                .optimize(OptimizeAction::Compact {
+                    options: lancedb::table::CompactionOptions::default(),
+                    remap_options: None,
+                })
+                .await
+                .infer_error()?
+                .compaction
+                // Surface a missing result as a Python error rather than
+                // panicking inside the future.
+                .ok_or_else(|| {
+                    PyRuntimeError::new_err("compaction did not return any statistics")
+                })?;
+            let prune_stats = inner
+                .optimize(OptimizeAction::Prune {
+                    older_than,
+                    delete_unverified: None,
+                })
+                .await
+                .infer_error()?
+                .prune
+                .ok_or_else(|| PyRuntimeError::new_err("prune did not return any statistics"))?;
+            // The index step's result is only checked for errors.
+            inner
+                .optimize(OptimizeAction::Index(OptimizeOptions::default()))
+                .await
+                .infer_error()?;
+            Ok(OptimizeStats {
+                compaction: CompactionStats {
+                    files_added: compaction_stats.files_added as u64,
+                    files_removed: compaction_stats.files_removed as u64,
+                    fragments_added: compaction_stats.fragments_added as u64,
+                    fragments_removed: compaction_stats.fragments_removed as u64,
+                },
+                prune: RemovalStats {
+                    bytes_removed: prune_stats.bytes_removed,
+                    old_versions_removed: prune_stats.old_versions,
+                },
+            })
+        })
+    }
 }