feat: add the optimize function to nodejs and async python (#1257)

The optimize function is pretty crucial for getting good performance
when building a large scale dataset but it was only exposed in rust
(many sync python users are probably doing this via to_lance today)

This PR adds the optimize function to nodejs and to python.

I left the function marked experimental because I think there will
likely be changes to optimization (e.g. if we add features like
"optimize on write"). I also only exposed the `cleanup_older_than`
configuration parameter since this one is very commonly used and the
rest have sensible defaults and we don't really know why we would
recommend different values for these defaults anyways.
This commit is contained in:
Weston Pace
2024-05-20 07:09:31 -07:00
committed by GitHub
parent 5349e8b1db
commit 4f512af024
9 changed files with 407 additions and 20 deletions

View File

@@ -2,7 +2,9 @@ use arrow::{
ffi_stream::ArrowArrayStreamReader,
pyarrow::{FromPyArrow, ToPyArrow},
};
use lancedb::table::{AddDataMode, Table as LanceDbTable};
use lancedb::table::{
AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
};
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyclass, pymethods,
@@ -17,6 +19,40 @@ use crate::{
query::Query,
};
/// Statistics about a compaction operation.
#[pyclass(get_all)]
#[derive(Clone, Debug)]
pub struct CompactionStats {
/// The number of fragments removed
pub fragments_removed: u64,
/// The number of new, compacted fragments added
pub fragments_added: u64,
/// The number of data files removed
pub files_removed: u64,
/// The number of new, compacted data files added
pub files_added: u64,
}
/// Statistics about a cleanup operation
#[pyclass(get_all)]
#[derive(Clone, Debug)]
pub struct RemovalStats {
/// The number of bytes removed
pub bytes_removed: u64,
/// The number of old versions removed
pub old_versions_removed: u64,
}
/// Statistics about an optimize operation
#[pyclass(get_all)]
#[derive(Clone, Debug)]
pub struct OptimizeStats {
/// Statistics about the compaction operation
pub compaction: CompactionStats,
/// Statistics about the removal operation
pub prune: RemovalStats,
}
#[pyclass]
pub struct Table {
// We keep a copy of the name to use if the inner table is dropped
@@ -191,4 +227,47 @@ impl Table {
pub fn query(&self) -> Query {
Query::new(self.inner_ref().unwrap().query())
}
pub fn optimize(self_: PyRef<'_, Self>, cleanup_since_ms: Option<u64>) -> PyResult<&PyAny> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
let compaction_stats = inner
.optimize(OptimizeAction::Compact {
options: lancedb::table::CompactionOptions::default(),
remap_options: None,
})
.await
.infer_error()?
.compaction
.unwrap();
let older_than = cleanup_since_ms.map(|since| Duration::milliseconds(since as i64));
let prune_stats = inner
.optimize(OptimizeAction::Prune {
older_than,
delete_unverified: None,
})
.await
.infer_error()?
.prune
.unwrap();
inner
.optimize(lancedb::table::OptimizeAction::Index(
OptimizeOptions::default(),
))
.await
.infer_error()?;
Ok(OptimizeStats {
compaction: CompactionStats {
files_added: compaction_stats.files_added as u64,
files_removed: compaction_stats.files_removed as u64,
fragments_added: compaction_stats.fragments_added as u64,
fragments_removed: compaction_stats.fragments_removed as u64,
},
prune: RemovalStats {
bytes_removed: prune_stats.bytes_removed,
old_versions_removed: prune_stats.old_versions,
},
})
})
}
}