diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts
index 495baddcb..3e48e8f05 100644
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -84,6 +84,16 @@ export interface OptimizeOptions {
* tbl.optimize({cleanupOlderThan: new Date()});
*/
cleanupOlderThan: Date;
+ /**
+ * Because they may be part of an in-progress transaction, files less than
+ * 7 days old are not deleted by default. If you are sure that there are no
+ * in-progress transactions, then you can set this to true to delete all
+ * files older than `cleanupOlderThan`.
+ *
+ * **WARNING**: This should only be set to true if you can guarantee that
+ * no other process is currently working on this dataset. Otherwise the
+ * dataset could be put into a corrupted state.
+ */
deleteUnverified: boolean;
}
@@ -501,19 +511,7 @@ export abstract class Table {
* - Index: Optimizes the indices, adding new data to existing indices
*
*
- * Experimental API
- * ----------------
- *
- * The optimization process is undergoing active development and may change.
- * Our goal with these changes is to improve the performance of optimization and
- * reduce the complexity.
- *
- * That being said, it is essential today to run optimize if you want the best
- * performance. It should be stable and safe to use in production, but it our
- * hope that the API may be simplified (or not even need to be called) in the
- * future.
- *
- * The frequency an application shoudl call optimize is based on the frequency of
+ * How often an application should call optimize depends on the frequency of
* data modifications. If data is frequently added, deleted, or updated then
* optimize should be run frequently. A good rule of thumb is to run optimize if
* you have added or modified 100,000 or more records or run more than 20 data
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index acaf534f5..0f0acaea0 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1506,22 +1506,17 @@ class Table(ABC):
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
+
+ .. warning::
+
+ This should only be set to True if you can guarantee that no other
+ process is currently working on this dataset. Otherwise the dataset
+ could be put into a corrupted state.
+
retrain: bool, default False
This parameter is no longer used and is deprecated.
- Experimental API
- ----------------
-
- The optimization process is undergoing active development and may change.
- Our goal with these changes is to improve the performance of optimization and
- reduce the complexity.
-
- That being said, it is essential today to run optimize if you want the best
- performance. It should be stable and safe to use in production, but it our
- hope that the API may be simplified (or not even need to be called) in the
- future.
-
- The frequency an application shoudl call optimize is based on the frequency of
+ How often an application should call optimize depends on the frequency of
data modifications. If data is frequently added, deleted, or updated then
optimize should be run frequently. A good rule of thumb is to run optimize if
you have added or modified 100,000 or more records or run more than 20 data
@@ -3047,22 +3042,17 @@ class LanceTable(Table):
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
+
+ .. warning::
+
+ This should only be set to True if you can guarantee that no other
+ process is currently working on this dataset. Otherwise the dataset
+ could be put into a corrupted state.
+
retrain: bool, default False
This parameter is no longer used and is deprecated.
- Experimental API
- ----------------
-
- The optimization process is undergoing active development and may change.
- Our goal with these changes is to improve the performance of optimization and
- reduce the complexity.
-
- That being said, it is essential today to run optimize if you want the best
- performance. It should be stable and safe to use in production, but it our
- hope that the API may be simplified (or not even need to be called) in the
- future.
-
- The frequency an application shoudl call optimize is based on the frequency of
+ How often an application should call optimize depends on the frequency of
data modifications. If data is frequently added, deleted, or updated then
optimize should be run frequently. A good rule of thumb is to run optimize if
you have added or modified 100,000 or more records or run more than 20 data
@@ -4630,22 +4620,17 @@ class AsyncTable:
in-progress operation (e.g. appending new data) and these files will not
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
+
+ .. warning::
+
+ This should only be set to True if you can guarantee that no other
+ process is currently working on this dataset. Otherwise the dataset
+ could be put into a corrupted state.
+
retrain: bool, default False
This parameter is no longer used and is deprecated.
- Experimental API
- ----------------
-
- The optimization process is undergoing active development and may change.
- Our goal with these changes is to improve the performance of optimization and
- reduce the complexity.
-
- That being said, it is essential today to run optimize if you want the best
- performance. It should be stable and safe to use in production, but it our
- hope that the API may be simplified (or not even need to be called) in the
- future.
-
- The frequency an application shoudl call optimize is based on the frequency of
+ How often an application should call optimize depends on the frequency of
data modifications. If data is frequently added, deleted, or updated then
optimize should be run frequently. A good rule of thumb is to run optimize if
you have added or modified 100,000 or more records or run more than 20 data
diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs
index 3f88f8782..db0636a1c 100644
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -951,17 +951,7 @@ impl Table {
/// * Prune: Removes old versions of the dataset
/// * Index: Optimizes the indices, adding new data to existing indices
///
- ///
- ///
- /// The optimization process is undergoing active development and may change.
- /// Our goal with these changes is to improve the performance of optimization and
- /// reduce the complexity.
- ///
- /// That being said, it is essential today to run optimize if you want the best
- /// performance. It should be stable and safe to use in production, but it our
- /// hope that the API may be simplified (or not even need to be called) in the future.
- ///
- /// The frequency an application shoudl call optimize is based on the frequency of
+ /// How often an application should call optimize depends on the frequency of
/// data modifications. If data is frequently added, deleted, or updated then
/// optimize should be run frequently. A good rule of thumb is to run optimize if
/// you have added or modified 100,000 or more records or run more than 20 data
diff --git a/rust/lancedb/src/table/optimize.rs b/rust/lancedb/src/table/optimize.rs
index 3d9a7d476..1b2966faf 100644
--- a/rust/lancedb/src/table/optimize.rs
+++ b/rust/lancedb/src/table/optimize.rs
@@ -64,6 +64,9 @@ pub enum OptimizeAction {
older_than: Option<Duration>,
/// Because they may be part of an in-progress transaction, files newer than 7 days old are not deleted by default.
/// If you are sure that there are no in-progress transactions, then you can set this to True to delete all files older than `older_than`.
+ ///
+ /// **WARNING**: This should only be set to true if you can guarantee that no other process is
+ /// currently working on this dataset. Otherwise the dataset could be put into a corrupted state.
delete_unverified: Option<bool>,
/// If true, an error will be returned if there are any old versions that are still tagged.
error_if_tagged_old_versions: Option<bool>,
@@ -117,6 +120,10 @@ pub(crate) async fn optimize_indices(table: &NativeTable, options: &OptimizeOpti
/// If you are sure that there are no in-progress transactions, then you
/// can set this to True to delete all files older than `older_than`.
///
+/// **WARNING**: `delete_unverified` should only be set to true if you can
+/// guarantee that no other process is currently working on this dataset.
+/// Otherwise the dataset could be put into a corrupted state.
+///
/// This calls into [lance::dataset::Dataset::cleanup_old_versions] and
/// returns the result.
pub(crate) async fn cleanup_old_versions(