diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts index 495baddcb..3e48e8f05 100644 --- a/nodejs/lancedb/table.ts +++ b/nodejs/lancedb/table.ts @@ -84,6 +84,16 @@ export interface OptimizeOptions { * tbl.optimize({cleanupOlderThan: new Date()}); */ cleanupOlderThan: Date; + /** + * Because they may be part of an in-progress transaction, files newer than + * 7 days old are not deleted by default. If you are sure that there are no + * in-progress transactions, then you can set this to true to delete all + * files older than `cleanupOlderThan`. + * + * **WARNING**: This should only be set to true if you can guarantee that + * no other process is currently working on this dataset. Otherwise the + * dataset could be put into a corrupted state. + */ deleteUnverified: boolean; } @@ -501,19 +511,7 @@ export abstract class Table { * - Index: Optimizes the indices, adding new data to existing indices * * - * Experimental API - * ---------------- - * - * The optimization process is undergoing active development and may change. - * Our goal with these changes is to improve the performance of optimization and - * reduce the complexity. - * - * That being said, it is essential today to run optimize if you want the best - * performance. It should be stable and safe to use in production, but it our - * hope that the API may be simplified (or not even need to be called) in the - * future. - * - * The frequency an application shoudl call optimize is based on the frequency of + * The frequency an application should call optimize is based on the frequency of * data modifications. If data is frequently added, deleted, or updated then * optimize should be run frequently. 
A good rule of thumb is to run optimize if * you have added or modified 100,000 or more records or run more than 20 data diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index acaf534f5..0f0acaea0 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -1506,22 +1506,17 @@ class Table(ABC): in-progress operation (e.g. appending new data) and these files will not be deleted unless they are at least 7 days old. If delete_unverified is True then these files will be deleted regardless of their age. + + .. warning:: + + This should only be set to True if you can guarantee that no other + process is currently working on this dataset. Otherwise the dataset + could be put into a corrupted state. + retrain: bool, default False This parameter is no longer used and is deprecated. - Experimental API - ---------------- - - The optimization process is undergoing active development and may change. - Our goal with these changes is to improve the performance of optimization and - reduce the complexity. - - That being said, it is essential today to run optimize if you want the best - performance. It should be stable and safe to use in production, but it our - hope that the API may be simplified (or not even need to be called) in the - future. - - The frequency an application shoudl call optimize is based on the frequency of + The frequency an application should call optimize is based on the frequency of data modifications. If data is frequently added, deleted, or updated then optimize should be run frequently. A good rule of thumb is to run optimize if you have added or modified 100,000 or more records or run more than 20 data @@ -3047,22 +3042,17 @@ class LanceTable(Table): in-progress operation (e.g. appending new data) and these files will not be deleted unless they are at least 7 days old. If delete_unverified is True then these files will be deleted regardless of their age. + + .. 
warning:: + + This should only be set to True if you can guarantee that no other + process is currently working on this dataset. Otherwise the dataset + could be put into a corrupted state. + retrain: bool, default False This parameter is no longer used and is deprecated. - Experimental API - ---------------- - - The optimization process is undergoing active development and may change. - Our goal with these changes is to improve the performance of optimization and - reduce the complexity. - - That being said, it is essential today to run optimize if you want the best - performance. It should be stable and safe to use in production, but it our - hope that the API may be simplified (or not even need to be called) in the - future. - - The frequency an application shoudl call optimize is based on the frequency of + The frequency an application should call optimize is based on the frequency of data modifications. If data is frequently added, deleted, or updated then optimize should be run frequently. A good rule of thumb is to run optimize if you have added or modified 100,000 or more records or run more than 20 data @@ -4630,22 +4620,17 @@ class AsyncTable: in-progress operation (e.g. appending new data) and these files will not be deleted unless they are at least 7 days old. If delete_unverified is True then these files will be deleted regardless of their age. + + .. warning:: + + This should only be set to True if you can guarantee that no other + process is currently working on this dataset. Otherwise the dataset + could be put into a corrupted state. + retrain: bool, default False This parameter is no longer used and is deprecated. - Experimental API - ---------------- - - The optimization process is undergoing active development and may change. - Our goal with these changes is to improve the performance of optimization and - reduce the complexity. - - That being said, it is essential today to run optimize if you want the best - performance. 
It should be stable and safe to use in production, but it our - hope that the API may be simplified (or not even need to be called) in the - future. - - The frequency an application shoudl call optimize is based on the frequency of + The frequency an application should call optimize is based on the frequency of data modifications. If data is frequently added, deleted, or updated then optimize should be run frequently. A good rule of thumb is to run optimize if you have added or modified 100,000 or more records or run more than 20 data diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 3f88f8782..db0636a1c 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -951,17 +951,7 @@ impl Table { /// * Prune: Removes old versions of the dataset /// * Index: Optimizes the indices, adding new data to existing indices /// - ///
Experimental API
- /// - /// The optimization process is undergoing active development and may change. - /// Our goal with these changes is to improve the performance of optimization and - /// reduce the complexity. - /// - /// That being said, it is essential today to run optimize if you want the best - /// performance. It should be stable and safe to use in production, but it our - /// hope that the API may be simplified (or not even need to be called) in the future. - /// - /// The frequency an application shoudl call optimize is based on the frequency of + /// How often an application should call optimize depends on the frequency of /// data modifications. If data is frequently added, deleted, or updated then /// optimize should be run frequently. A good rule of thumb is to run optimize if /// you have added or modified 100,000 or more records or run more than 20 data diff --git a/rust/lancedb/src/table/optimize.rs b/rust/lancedb/src/table/optimize.rs index 3d9a7d476..1b2966faf 100644 --- a/rust/lancedb/src/table/optimize.rs +++ b/rust/lancedb/src/table/optimize.rs @@ -64,6 +64,9 @@ pub enum OptimizeAction { older_than: Option, /// Because they may be part of an in-progress transaction, files newer than 7 days old are not deleted by default. /// If you are sure that there are no in-progress transactions, then you can set this to True to delete all files older than `older_than`. + /// + /// **WARNING**: This should only be set to true if you can guarantee that no other process is + /// currently working on this dataset. Otherwise the dataset could be put into a corrupted state. delete_unverified: Option, /// If true, an error will be returned if there are any old versions that are still tagged. error_if_tagged_old_versions: Option, @@ -117,6 +120,10 @@ pub(crate) async fn optimize_indices(table: &NativeTable, options: &OptimizeOpti /// If you are sure that there are no in-progress transactions, then you /// can set this to True to delete all files older than `older_than`. 
/// +/// **WARNING**: This should only be set to true if you can guarantee that +/// no other process is currently working on this dataset. Otherwise the +/// dataset could be put into a corrupted state. +/// /// This calls into [lance::dataset::Dataset::cleanup_old_versions] and /// returns the result. pub(crate) async fn cleanup_old_versions(