mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-15 08:12:58 +00:00
The optimize function is crucial for getting good performance when building a large-scale dataset, but it was previously only exposed in Rust (many sync Python users are probably doing this via `to_lance` today). This PR adds the optimize function to Node.js and to Python. I left the function marked experimental because there will likely be changes to optimization (e.g. if we add features like "optimize on write"). I also only exposed the `cleanup_older_than` configuration parameter, since it is very commonly used; the remaining parameters have sensible defaults, and we do not know what different values we would recommend for them anyway.
420 lines
14 KiB
TypeScript
420 lines
14 KiB
TypeScript
// Copyright 2024 Lance Developers.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
import { Schema, tableFromIPC } from "apache-arrow";
|
|
import { Data, fromDataToBuffer } from "./arrow";
|
|
import { IndexOptions } from "./indices";
|
|
import {
|
|
AddColumnsSql,
|
|
ColumnAlteration,
|
|
IndexConfig,
|
|
OptimizeStats,
|
|
Table as _NativeTable,
|
|
} from "./native";
|
|
import { Query, VectorQuery } from "./query";
|
|
|
|
export { IndexConfig } from "./native";
|
|
/**
 * Options for adding data to a table.
 */
export interface AddDataOptions {
  /**
   * Controls what happens to existing data when new data is added.
   *
   * If "append" (the default) then the new data will be added to the table
   *
   * If "overwrite" then the new data will replace the existing data in the table.
   */
  mode: "append" | "overwrite";
}
|
|
|
|
/**
 * Options for updating data in a table.
 */
export interface UpdateOptions {
  /**
   * A filter that limits the scope of the update.
   *
   * This should be an SQL filter expression.
   *
   * Only rows that satisfy the expression will be updated.
   *
   * For example, this could be 'my_col == 0' to replace all instances
   * of 0 in a column with some other default value.
   */
  where: string;
}
|
|
|
|
/**
 * Options for optimizing a table (see {@link Table#optimize}).
 */
export interface OptimizeOptions {
  /**
   * If set then all versions older than the given date will
   * be removed.  The current version will never be removed.
   * The default is 7 days.
   * @example
   * // Delete all versions older than 1 day
   * const olderThan = new Date();
   * olderThan.setDate(olderThan.getDate() - 1);
   * tbl.optimize({ cleanupOlderThan: olderThan });
   *
   * // Delete all versions except the current version
   * tbl.optimize({ cleanupOlderThan: new Date() });
   */
  cleanupOlderThan: Date;
}
|
|
|
|
/**
|
|
* A Table is a collection of Records in a LanceDB Database.
|
|
*
|
|
* A Table object is expected to be long lived and reused for multiple operations.
|
|
* Table objects will cache a certain amount of index data in memory. This cache
|
|
* will be freed when the Table is garbage collected. To eagerly free the cache you
|
|
* can call the `close` method. Once the Table is closed, it cannot be used for any
|
|
* further operations.
|
|
*
|
|
* Closing a table is optional. It not closed, it will be closed when it is garbage
|
|
* collected.
|
|
*/
|
|
export class Table {
|
|
private readonly inner: _NativeTable;
|
|
|
|
/** Construct a Table. Internal use only. */
|
|
constructor(inner: _NativeTable) {
|
|
this.inner = inner;
|
|
}
|
|
|
|
/** Return true if the table has not been closed */
|
|
isOpen(): boolean {
|
|
return this.inner.isOpen();
|
|
}
|
|
|
|
/**
|
|
* Close the table, releasing any underlying resources.
|
|
*
|
|
* It is safe to call this method multiple times.
|
|
*
|
|
* Any attempt to use the table after it is closed will result in an error.
|
|
*/
|
|
close(): void {
|
|
this.inner.close();
|
|
}
|
|
|
|
/** Return a brief description of the table */
|
|
display(): string {
|
|
return this.inner.display();
|
|
}
|
|
|
|
/** Get the schema of the table. */
|
|
async schema(): Promise<Schema> {
|
|
const schemaBuf = await this.inner.schema();
|
|
const tbl = tableFromIPC(schemaBuf);
|
|
return tbl.schema;
|
|
}
|
|
|
|
/**
|
|
* Insert records into this Table.
|
|
* @param {Data} data Records to be inserted into the Table
|
|
*/
|
|
async add(data: Data, options?: Partial<AddDataOptions>): Promise<void> {
|
|
const mode = options?.mode ?? "append";
|
|
|
|
const buffer = await fromDataToBuffer(data);
|
|
await this.inner.add(buffer, mode);
|
|
}
|
|
|
|
/**
|
|
* Update existing records in the Table
|
|
*
|
|
* An update operation can be used to adjust existing values. Use the
|
|
* returned builder to specify which columns to update. The new value
|
|
* can be a literal value (e.g. replacing nulls with some default value)
|
|
* or an expression applied to the old value (e.g. incrementing a value)
|
|
*
|
|
* An optional condition can be specified (e.g. "only update if the old
|
|
* value is 0")
|
|
*
|
|
* Note: if your condition is something like "some_id_column == 7" and
|
|
* you are updating many rows (with different ids) then you will get
|
|
* better performance with a single [`merge_insert`] call instead of
|
|
* repeatedly calilng this method.
|
|
* @param {Map<string, string> | Record<string, string>} updates - the
|
|
* columns to update
|
|
*
|
|
* Keys in the map should specify the name of the column to update.
|
|
* Values in the map provide the new value of the column. These can
|
|
* be SQL literal strings (e.g. "7" or "'foo'") or they can be expressions
|
|
* based on the row being updated (e.g. "my_col + 1")
|
|
* @param {Partial<UpdateOptions>} options - additional options to control
|
|
* the update behavior
|
|
*/
|
|
async update(
|
|
updates: Map<string, string> | Record<string, string>,
|
|
options?: Partial<UpdateOptions>,
|
|
) {
|
|
const onlyIf = options?.where;
|
|
let columns: [string, string][];
|
|
if (updates instanceof Map) {
|
|
columns = Array.from(updates.entries());
|
|
} else {
|
|
columns = Object.entries(updates);
|
|
}
|
|
await this.inner.update(onlyIf, columns);
|
|
}
|
|
|
|
/** Count the total number of rows in the dataset. */
|
|
async countRows(filter?: string): Promise<number> {
|
|
return await this.inner.countRows(filter);
|
|
}
|
|
|
|
/** Delete the rows that satisfy the predicate. */
|
|
async delete(predicate: string): Promise<void> {
|
|
await this.inner.delete(predicate);
|
|
}
|
|
|
|
/**
|
|
* Create an index to speed up queries.
|
|
*
|
|
* Indices can be created on vector columns or scalar columns.
|
|
* Indices on vector columns will speed up vector searches.
|
|
* Indices on scalar columns will speed up filtering (in both
|
|
* vector and non-vector searches)
|
|
* @example
|
|
* // If the column has a vector (fixed size list) data type then
|
|
* // an IvfPq vector index will be created.
|
|
* const table = await conn.openTable("my_table");
|
|
* await table.createIndex("vector");
|
|
* @example
|
|
* // For advanced control over vector index creation you can specify
|
|
* // the index type and options.
|
|
* const table = await conn.openTable("my_table");
|
|
* await table.createIndex("vector", {
|
|
* config: lancedb.Index.ivfPq({
|
|
* numPartitions: 128,
|
|
* numSubVectors: 16,
|
|
* }),
|
|
* });
|
|
* @example
|
|
* // Or create a Scalar index
|
|
* await table.createIndex("my_float_col");
|
|
*/
|
|
async createIndex(column: string, options?: Partial<IndexOptions>) {
|
|
// Bit of a hack to get around the fact that TS has no package-scope.
|
|
// biome-ignore lint/suspicious/noExplicitAny: skip
|
|
const nativeIndex = (options?.config as any)?.inner;
|
|
await this.inner.createIndex(nativeIndex, column, options?.replace);
|
|
}
|
|
|
|
/**
|
|
* Create a {@link Query} Builder.
|
|
*
|
|
* Queries allow you to search your existing data. By default the query will
|
|
* return all the data in the table in no particular order. The builder
|
|
* returned by this method can be used to control the query using filtering,
|
|
* vector similarity, sorting, and more.
|
|
*
|
|
* Note: By default, all columns are returned. For best performance, you should
|
|
* only fetch the columns you need.
|
|
*
|
|
* When appropriate, various indices and statistics based pruning will be used to
|
|
* accelerate the query.
|
|
* @example
|
|
* // SQL-style filtering
|
|
* //
|
|
* // This query will return up to 1000 rows whose value in the `id` column
|
|
* // is greater than 5. LanceDb supports a broad set of filtering functions.
|
|
* for await (const batch of table
|
|
* .query()
|
|
* .where("id > 1")
|
|
* .select(["id"])
|
|
* .limit(20)) {
|
|
* console.log(batch);
|
|
* }
|
|
* @example
|
|
* // Vector Similarity Search
|
|
* //
|
|
* // This example will find the 10 rows whose value in the "vector" column are
|
|
* // closest to the query vector [1.0, 2.0, 3.0]. If an index has been created
|
|
* // on the "vector" column then this will perform an ANN search.
|
|
* //
|
|
* // The `refineFactor` and `nprobes` methods are used to control the recall /
|
|
* // latency tradeoff of the search.
|
|
* for await (const batch of table
|
|
* .query()
|
|
* .where("id > 1")
|
|
* .select(["id"])
|
|
* .limit(20)) {
|
|
* console.log(batch);
|
|
* }
|
|
* @example
|
|
* // Scan the full dataset
|
|
* //
|
|
* // This query will return everything in the table in no particular order.
|
|
* for await (const batch of table.query()) {
|
|
* console.log(batch);
|
|
* }
|
|
* @returns {Query} A builder that can be used to parameterize the query
|
|
*/
|
|
query(): Query {
|
|
return new Query(this.inner);
|
|
}
|
|
|
|
/**
|
|
* Search the table with a given query vector.
|
|
*
|
|
* This is a convenience method for preparing a vector query and
|
|
* is the same thing as calling `nearestTo` on the builder returned
|
|
* by `query`. @see {@link Query#nearestTo} for more details.
|
|
*/
|
|
vectorSearch(vector: unknown): VectorQuery {
|
|
return this.query().nearestTo(vector);
|
|
}
|
|
|
|
// TODO: Support BatchUDF
|
|
/**
|
|
* Add new columns with defined values.
|
|
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
|
|
* the SQL expression to use to calculate the value of the new column. These
|
|
* expressions will be evaluated for each row in the table, and can
|
|
* reference existing columns in the table.
|
|
*/
|
|
async addColumns(newColumnTransforms: AddColumnsSql[]): Promise<void> {
|
|
await this.inner.addColumns(newColumnTransforms);
|
|
}
|
|
|
|
/**
|
|
* Alter the name or nullability of columns.
|
|
* @param {ColumnAlteration[]} columnAlterations One or more alterations to
|
|
* apply to columns.
|
|
*/
|
|
async alterColumns(columnAlterations: ColumnAlteration[]): Promise<void> {
|
|
await this.inner.alterColumns(columnAlterations);
|
|
}
|
|
|
|
/**
|
|
* Drop one or more columns from the dataset
|
|
*
|
|
* This is a metadata-only operation and does not remove the data from the
|
|
* underlying storage. In order to remove the data, you must subsequently
|
|
* call ``compact_files`` to rewrite the data without the removed columns and
|
|
* then call ``cleanup_files`` to remove the old files.
|
|
* @param {string[]} columnNames The names of the columns to drop. These can
|
|
* be nested column references (e.g. "a.b.c") or top-level column names
|
|
* (e.g. "a").
|
|
*/
|
|
async dropColumns(columnNames: string[]): Promise<void> {
|
|
await this.inner.dropColumns(columnNames);
|
|
}
|
|
|
|
/** Retrieve the version of the table */
|
|
async version(): Promise<number> {
|
|
return await this.inner.version();
|
|
}
|
|
|
|
/**
|
|
* Checks out a specific version of the table _This is an in-place operation._
|
|
*
|
|
* This allows viewing previous versions of the table. If you wish to
|
|
* keep writing to the dataset starting from an old version, then use
|
|
* the `restore` function.
|
|
*
|
|
* Calling this method will set the table into time-travel mode. If you
|
|
* wish to return to standard mode, call `checkoutLatest`.
|
|
* @param {number} version The version to checkout
|
|
* @example
|
|
* ```typescript
|
|
* import * as lancedb from "@lancedb/lancedb"
|
|
* const db = await lancedb.connect("./.lancedb");
|
|
* const table = await db.createTable("my_table", [
|
|
* { vector: [1.1, 0.9], type: "vector" },
|
|
* ]);
|
|
*
|
|
* console.log(await table.version()); // 1
|
|
* console.log(table.display());
|
|
* await table.add([{ vector: [0.5, 0.2], type: "vector" }]);
|
|
* await table.checkout(1);
|
|
* console.log(await table.version()); // 2
|
|
* ```
|
|
*/
|
|
async checkout(version: number): Promise<void> {
|
|
await this.inner.checkout(version);
|
|
}
|
|
|
|
/**
|
|
* Checkout the latest version of the table. _This is an in-place operation._
|
|
*
|
|
* The table will be set back into standard mode, and will track the latest
|
|
* version of the table.
|
|
*/
|
|
async checkoutLatest(): Promise<void> {
|
|
await this.inner.checkoutLatest();
|
|
}
|
|
|
|
/**
|
|
* Restore the table to the currently checked out version
|
|
*
|
|
* This operation will fail if checkout has not been called previously
|
|
*
|
|
* This operation will overwrite the latest version of the table with a
|
|
* previous version. Any changes made since the checked out version will
|
|
* no longer be visible.
|
|
*
|
|
* Once the operation concludes the table will no longer be in a checked
|
|
* out state and the read_consistency_interval, if any, will apply.
|
|
*/
|
|
async restore(): Promise<void> {
|
|
await this.inner.restore();
|
|
}
|
|
|
|
/**
|
|
* Optimize the on-disk data and indices for better performance.
|
|
*
|
|
* Modeled after ``VACUUM`` in PostgreSQL.
|
|
*
|
|
* Optimization covers three operations:
|
|
*
|
|
* - Compaction: Merges small files into larger ones
|
|
* - Prune: Removes old versions of the dataset
|
|
* - Index: Optimizes the indices, adding new data to existing indices
|
|
*
|
|
*
|
|
* Experimental API
|
|
* ----------------
|
|
*
|
|
* The optimization process is undergoing active development and may change.
|
|
* Our goal with these changes is to improve the performance of optimization and
|
|
* reduce the complexity.
|
|
*
|
|
* That being said, it is essential today to run optimize if you want the best
|
|
* performance. It should be stable and safe to use in production, but it our
|
|
* hope that the API may be simplified (or not even need to be called) in the
|
|
* future.
|
|
*
|
|
* The frequency an application shoudl call optimize is based on the frequency of
|
|
* data modifications. If data is frequently added, deleted, or updated then
|
|
* optimize should be run frequently. A good rule of thumb is to run optimize if
|
|
* you have added or modified 100,000 or more records or run more than 20 data
|
|
* modification operations.
|
|
*/
|
|
async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
|
|
let cleanupOlderThanMs;
|
|
if (
|
|
options?.cleanupOlderThan !== undefined &&
|
|
options?.cleanupOlderThan !== null
|
|
) {
|
|
cleanupOlderThanMs =
|
|
new Date().getTime() - options.cleanupOlderThan.getTime();
|
|
}
|
|
return await this.inner.optimize(cleanupOlderThanMs);
|
|
}
|
|
|
|
/** List all indices that have been created with {@link Table.createIndex} */
|
|
async listIndices(): Promise<IndexConfig[]> {
|
|
return await this.inner.listIndices();
|
|
}
|
|
}
|