Files
lancedb/nodejs/lancedb/table.ts
2026-05-18 09:53:20 -07:00

1189 lines
40 KiB
TypeScript

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
import {
Table as ArrowTable,
Data,
DataType,
Field,
IntoVector,
MultiVector,
Schema,
dataTypeToJson,
fromDataToBuffer,
fromTableToBuffer,
isMultiVector,
makeEmptyTable,
tableFromIPC,
} from "./arrow";
import { EmbeddingFunctionConfig, getRegistry } from "./embedding/registry";
import { IndexOptions } from "./indices";
import { MergeInsertBuilder } from "./merge";
import {
AddColumnsResult,
AddColumnsSql,
AddResult,
AlterColumnsResult,
DeleteResult,
DropColumnsResult,
IndexConfig,
IndexStatistics,
OptimizeStats,
TableStatistics,
Tags,
UpdateResult,
Table as _NativeTable,
} from "./native";
import {
FullTextQuery,
Query,
TakeQuery,
VectorQuery,
instanceOfFullTextQuery,
} from "./query";
import { sanitizeType } from "./sanitize";
import { IntoSql, toSQL } from "./util";
export { IndexConfig } from "./native";
/**
* Progress snapshot for a write operation, delivered to the `progress`
* callback passed to {@link Table.add}.
*/
export interface WriteProgress {
/** Number of rows written so far. */
outputRows: number;
/** Number of bytes written so far. */
outputBytes: number;
/**
* Total rows expected, when the input source reports it.
*
* Always set on the final callback (the one with `done: true`), falling
* back to the actual number of rows written when the source could not
* report a row count up front.
*/
totalRows?: number;
/** Wall-clock seconds since the write started. */
elapsedSeconds: number;
/** Number of parallel write tasks currently in flight. */
activeTasks: number;
/** Total number of parallel write tasks (the write parallelism). */
totalTasks: number;
/** `true` for the final callback; `false` otherwise. */
done: boolean;
}
/**
* Options for adding data to a table.
*/
export interface AddDataOptions {
/**
* If "append" (the default) then the new data will be added to the table
*
* If "overwrite" then the new data will replace the existing data in the table.
*/
mode: "append" | "overwrite";
/**
* Optional callback invoked periodically with write progress.
*
* The callback is fired once per batch written and once more with
* `done: true` when the write completes. Calls are dispatched
* asynchronously to the JS event loop and never block the write — a slow
* callback will queue events rather than back-pressure the writer.
*
* Errors thrown from the callback are logged with `console.warn` and
* swallowed — they do not abort the write.
*
* @example
* ```ts
* await table.add(data, {
* progress: (p) => {
* console.log(`${p.outputRows}/${p.totalRows ?? "?"} rows`);
* },
* });
* ```
*/
progress: (progress: WriteProgress) => void;
}
export interface UpdateOptions {
/**
* A filter that limits the scope of the update.
*
* This should be an SQL filter expression.
*
* Only rows that satisfy the expression will be updated.
*
* For example, this could be 'my_col == 0' to replace all instances
* of 0 in a column with some other default value.
*/
where: string;
}
export interface OptimizeOptions {
/**
* If set then all versions older than the given date
* be removed. The current version will never be removed.
* The default is 7 days
* @example
* // Delete all versions older than 1 day
* const olderThan = new Date();
* olderThan.setDate(olderThan.getDate() - 1));
* tbl.optimize({cleanupOlderThan: olderThan});
*
* // Delete all versions except the current version
* tbl.optimize({cleanupOlderThan: new Date()});
*/
cleanupOlderThan: Date;
/**
* Because they may be part of an in-progress transaction, files newer than
* 7 days old are not deleted by default. If you are sure that there are no
* in-progress transactions, then you can set this to true to delete all
* files older than `cleanupOlderThan`.
*
* **WARNING**: This should only be set to true if you can guarantee that
* no other process is currently working on this dataset. Otherwise the
* dataset could be put into a corrupted state.
*/
deleteUnverified: boolean;
}
export interface Version {
version: number;
timestamp: Date;
metadata: Record<string, string>;
}
/**
* Specification selecting Lance's MemWAL LSM-style write path for
* `mergeInsert`.
*
* `specType` is `"bucket"`, `"identity"`, or `"unsharded"`. For `"bucket"`,
* `column` and `numBuckets` are required; for `"identity"`, `column` is
* required.
*/
export interface LsmWriteSpec {
/** One of `"bucket"`, `"identity"`, or `"unsharded"`. */
specType: "bucket" | "identity" | "unsharded";
/** Bucket and identity variants: the sharding column. */
column?: string;
/** Bucket variant: the number of buckets, in `[1, 1024]`. */
numBuckets?: number;
/** Names of indexes the MemWAL should keep up to date during writes. */
maintainedIndexes?: string[];
/** Default `ShardWriter` configuration recorded in the MemWAL index. */
writerConfigDefaults?: Record<string, string>;
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
* A Table object is expected to be long lived and reused for multiple operations.
* Table objects will cache a certain amount of index data in memory. This cache
* will be freed when the Table is garbage collected. To eagerly free the cache you
* can call the `close` method. Once the Table is closed, it cannot be used for any
* further operations.
*
* Tables are created using the methods {@link Connection#createTable}
* and {@link Connection#createEmptyTable}. Existing tables are opened
* using {@link Connection#openTable}.
*
* Closing a table is optional. It not closed, it will be closed when it is garbage
* collected.
*
* @hideconstructor
*/
export abstract class Table {
[Symbol.for("nodejs.util.inspect.custom")](): string {
return this.display();
}
/** Returns the name of the table */
abstract get name(): string;
/** Return true if the table has not been closed */
abstract isOpen(): boolean;
/**
* Close the table, releasing any underlying resources.
*
* It is safe to call this method multiple times.
*
* Any attempt to use the table after it is closed will result in an error.
*/
abstract close(): void;
/** Return a brief description of the table */
abstract display(): string;
/** Get the schema of the table. */
abstract schema(): Promise<Schema>;
/**
* Insert records into this Table.
* @param {Data} data Records to be inserted into the Table
* @returns {Promise<AddResult>} A promise that resolves to an object
* containing the new version number of the table
*/
abstract add(
data: Data,
options?: Partial<AddDataOptions>,
): Promise<AddResult>;
/**
* Update existing records in the Table
* @param opts.values The values to update. The keys are the column names and the values
* are the values to set.
* @returns {Promise<UpdateResult>} A promise that resolves to an object containing
* the number of rows updated and the new version number
* @example
* ```ts
* table.update({where:"x = 2", values:{"vector": [10, 10]}})
* ```
*/
abstract update(
opts: {
values: Map<string, IntoSql> | Record<string, IntoSql>;
} & Partial<UpdateOptions>,
): Promise<UpdateResult>;
/**
* Update existing records in the Table
* @param opts.valuesSql The values to update. The keys are the column names and the values
* are the values to set. The values are SQL expressions.
* @returns {Promise<UpdateResult>} A promise that resolves to an object containing
* the number of rows updated and the new version number
* @example
* ```ts
* table.update({where:"x = 2", valuesSql:{"x": "x + 1"}})
* ```
*/
abstract update(
opts: {
valuesSql: Map<string, string> | Record<string, string>;
} & Partial<UpdateOptions>,
): Promise<UpdateResult>;
/**
* Update existing records in the Table
*
* An update operation can be used to adjust existing values. Use the
* returned builder to specify which columns to update. The new value
* can be a literal value (e.g. replacing nulls with some default value)
* or an expression applied to the old value (e.g. incrementing a value)
*
* An optional condition can be specified (e.g. "only update if the old
* value is 0")
*
* Note: if your condition is something like "some_id_column == 7" and
* you are updating many rows (with different ids) then you will get
* better performance with a single [`merge_insert`] call instead of
* repeatedly calilng this method.
* @param {Map<string, string> | Record<string, string>} updates - the
* columns to update
* @returns {Promise<UpdateResult>} A promise that resolves to an object
* containing the number of rows updated and the new version number
*
* Keys in the map should specify the name of the column to update.
* Values in the map provide the new value of the column. These can
* be SQL literal strings (e.g. "7" or "'foo'") or they can be expressions
* based on the row being updated (e.g. "my_col + 1")
* @param {Partial<UpdateOptions>} options - additional options to control
* the update behavior
*/
abstract update(
updates: Map<string, string> | Record<string, string>,
options?: Partial<UpdateOptions>,
): Promise<UpdateResult>;
/** Count the total number of rows in the dataset. */
abstract countRows(filter?: string): Promise<number>;
/**
* Delete the rows that satisfy the predicate.
* @returns {Promise<DeleteResult>} A promise that resolves to an object
* containing the new version number of the table
*/
abstract delete(predicate: string): Promise<DeleteResult>;
/**
* Create an index to speed up queries.
*
* Indices can be created on vector columns or scalar columns.
* Indices on vector columns will speed up vector searches.
* Indices on scalar columns will speed up filtering (in both
* vector and non-vector searches)
*
* We currently don't support custom named indexes.
* The index name will always be `${column}_idx`.
*
* @example
* // If the column has a vector (fixed size list) data type then
* // an IvfPq vector index will be created.
* const table = await conn.openTable("my_table");
* await table.createIndex("vector");
* @example
* // For advanced control over vector index creation you can specify
* // the index type and options.
* const table = await conn.openTable("my_table");
* await table.createIndex("vector", {
* config: lancedb.Index.ivfPq({
* numPartitions: 128,
* numSubVectors: 16,
* }),
* });
* @example
* // Or create a Scalar index
* await table.createIndex("my_float_col");
*/
abstract createIndex(
column: string,
options?: Partial<IndexOptions>,
): Promise<void>;
/**
* Drop an index from the table.
*
* @param name The name of the index.
*
* This does not delete the index from disk, it just removes it from the table.
* To delete the index, run {@link Table#optimize} after dropping the index.
*
* Use {@link Table.listIndices} to find the names of the indices.
*/
abstract dropIndex(name: string): Promise<void>;
/**
* Prewarm an index in the table.
*
* @param name The name of the index.
*
* This will load the index into memory. This may reduce the cold-start time for
* future queries. If the index does not fit in the cache then this call may be
* wasteful.
*/
abstract prewarmIndex(name: string): Promise<void>;
/**
* Prewarm one or more columns of data in the table.
*
* @param columns The columns to prewarm. If undefined, all columns are prewarmed.
*
* This will load the column data into the page cache so that future queries that
* read those columns avoid the initial cold-start latency. This call initiates
* prewarming and returns once the request is accepted; the warming itself may
* continue in the background. Calling it on already-prewarmed columns is a
* no-op on the server.
*
* Prewarming is generally useful for columns used in filters or projections.
* Large columns (e.g. high-dimensional vectors or binary data) may not be
* practical to prewarm.
*
* This feature is currently only supported on remote tables.
*/
abstract prewarmData(columns?: string[]): Promise<void>;
/**
* Waits for asynchronous indexing to complete on the table.
*
* @param indexNames The name of the indices to wait for
* @param timeoutSeconds The number of seconds to wait before timing out
*
* This will raise an error if the indices are not created and fully indexed within the timeout.
*/
abstract waitForIndex(
indexNames: string[],
timeoutSeconds: number,
): Promise<void>;
/**
* Create a {@link Query} Builder.
*
* Queries allow you to search your existing data. By default the query will
* return all the data in the table in no particular order. The builder
* returned by this method can be used to control the query using filtering,
* vector similarity, sorting, and more.
*
* Note: By default, all columns are returned. For best performance, you should
* only fetch the columns you need.
*
* When appropriate, various indices and statistics based pruning will be used to
* accelerate the query.
* @example
* // SQL-style filtering
* //
* // This query will return up to 1000 rows whose value in the `id` column
* // is greater than 5. LanceDb supports a broad set of filtering functions.
* for await (const batch of table
* .query()
* .where("id > 1")
* .select(["id"])
* .limit(20)) {
* console.log(batch);
* }
* @example
* // Vector Similarity Search
* //
* // This example will find the 10 rows whose value in the "vector" column are
* // closest to the query vector [1.0, 2.0, 3.0]. If an index has been created
* // on the "vector" column then this will perform an ANN search.
* //
* // The `refineFactor` and `nprobes` methods are used to control the recall /
* // latency tradeoff of the search.
* for await (const batch of table
* .query()
* .where("id > 1")
* .select(["id"])
* .limit(20)) {
* console.log(batch);
* }
* @example
* // Scan the full dataset
* //
* // This query will return everything in the table in no particular order.
* for await (const batch of table.query()) {
* console.log(batch);
* }
* @returns {Query} A builder that can be used to parameterize the query
*/
abstract query(): Query;
/**
* Create a query that returns a subset of the rows in the table.
* @param offsets The offsets of the rows to return.
* @returns A builder that can be used to parameterize the query.
*/
abstract takeOffsets(offsets: number[]): TakeQuery;
/**
* Create a query that returns a subset of the rows in the table.
* @param rowIds The row ids of the rows to return.
*
* Row ids returned by `withRowId()` are `bigint`, so `bigint[]` is supported.
* For convenience / backwards compatibility, `number[]` is also accepted (for
* small row ids that fit in a safe integer).
* @returns A builder that can be used to parameterize the query.
*/
abstract takeRowIds(rowIds: readonly (bigint | number)[]): TakeQuery;
/**
* Create a search query to find the nearest neighbors
* of the given query
* @param {string | IntoVector} query - the query, a vector or string
* @param {string} queryType - the type of the query, "vector", "fts", or "auto"
* @param {string | string[]} ftsColumns - the columns to search in for full text search
* for now, only one column can be searched at a time.
*
* when "auto" is used, if the query is a string and an embedding function is defined, it will be treated as a vector query
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
*/
abstract search(
query: string | IntoVector | MultiVector | FullTextQuery,
queryType?: string,
ftsColumns?: string | string[],
): VectorQuery | Query;
/**
* Search the table with a given query vector.
*
* This is a convenience method for preparing a vector query and
* is the same thing as calling `nearestTo` on the builder returned
* by `query`. @see {@link Query#nearestTo} for more details.
*/
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
/**
* Add new columns with defined values.
* @param {AddColumnsSql[] | Field | Field[] | Schema} newColumnTransforms Either:
* - An array of objects with column names and SQL expressions to calculate values
* - A single Arrow Field defining one column with its data type (column will be initialized with null values)
* - An array of Arrow Fields defining columns with their data types (columns will be initialized with null values)
* - An Arrow Schema defining columns with their data types (columns will be initialized with null values)
* @returns {Promise<AddColumnsResult>} A promise that resolves to an object
* containing the new version number of the table after adding the columns.
*/
abstract addColumns(
newColumnTransforms: AddColumnsSql[] | Field | Field[] | Schema,
): Promise<AddColumnsResult>;
/**
* Alter the name or nullability of columns.
* @param {ColumnAlteration[]} columnAlterations One or more alterations to
* apply to columns.
* @returns {Promise<AlterColumnsResult>} A promise that resolves to an object
* containing the new version number of the table after altering the columns.
*/
abstract alterColumns(
columnAlterations: ColumnAlteration[],
): Promise<AlterColumnsResult>;
/**
* Drop one or more columns from the dataset
*
* This is a metadata-only operation and does not remove the data from the
* underlying storage. In order to remove the data, you must subsequently
* call ``compact_files`` to rewrite the data without the removed columns and
* then call ``cleanup_files`` to remove the old files.
* @param {string[]} columnNames The names of the columns to drop. These can
* be nested column references (e.g. "a.b.c") or top-level column names
* (e.g. "a").
* @returns {Promise<DropColumnsResult>} A promise that resolves to an object
* containing the new version number of the table after dropping the columns.
*/
abstract dropColumns(columnNames: string[]): Promise<DropColumnsResult>;
/**
* Set the unenforced primary key for this table to a single column.
*
* "Unenforced" means LanceDB does not check uniqueness on writes; the
* column is recorded in the schema as the primary key for use by features
* such as `merge_insert`. Only single-column primary keys are supported,
* and the key cannot be changed once set.
* @param {string | string[]} columns The primary key column. A one-element
* array is also accepted; passing more than one column is rejected.
* @returns {Promise<void>}
*/
abstract setUnenforcedPrimaryKey(columns: string | string[]): Promise<void>;
/**
* Install an {@link LsmWriteSpec} on this table, selecting Lance's MemWAL
* LSM-style write path for future `mergeInsert` calls.
*
* `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
*
* - `"bucket"` — hash-bucket writes by the single-column unenforced primary
* key (`column` and `numBuckets` required).
* - `"identity"` — shard by the raw value of a scalar `column`.
* - `"unsharded"` — route every write to a single shard.
*
* All variants require the table to have an unenforced primary key
* ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
* requires it to be the single column being bucketed.
* @param {LsmWriteSpec} spec The sharding spec to install.
* @returns {Promise<void>}
* @example
* ```ts
* await table.setUnenforcedPrimaryKey("id");
* await table.setLsmWriteSpec({
* specType: "bucket",
* column: "id",
* numBuckets: 16,
* maintainedIndexes: ["id_idx"],
* });
* ```
*/
abstract setLsmWriteSpec(spec: LsmWriteSpec): Promise<void>;
/**
* Remove the {@link LsmWriteSpec} from this table, reverting to the standard
* `mergeInsert` write path.
*
* Errors if no spec is currently set.
* @returns {Promise<void>}
*/
abstract unsetLsmWriteSpec(): Promise<void>;
/** Retrieve the version of the table */
abstract version(): Promise<number>;
/**
* Checks out a specific version of the table _This is an in-place operation._
*
* This allows viewing previous versions of the table. If you wish to
* keep writing to the dataset starting from an old version, then use
* the `restore` function.
*
* Calling this method will set the table into time-travel mode. If you
* wish to return to standard mode, call `checkoutLatest`.
* @param {number | string} version The version to checkout, could be version number or tag
* @example
* ```typescript
* import * as lancedb from "@lancedb/lancedb"
* const db = await lancedb.connect("./.lancedb");
* const table = await db.createTable("my_table", [
* { vector: [1.1, 0.9], type: "vector" },
* ]);
*
* console.log(await table.version()); // 1
* console.log(table.display());
* await table.add([{ vector: [0.5, 0.2], type: "vector" }]);
* await table.checkout(1);
* console.log(await table.version()); // 2
* ```
*/
abstract checkout(version: number | string): Promise<void>;
/**
* Checkout the latest version of the table. _This is an in-place operation._
*
* The table will be set back into standard mode, and will track the latest
* version of the table.
*/
abstract checkoutLatest(): Promise<void>;
/**
* List all the versions of the table
*/
abstract listVersions(): Promise<Version[]>;
/**
* Get a tags manager for this table.
*
* Tags allow you to label specific versions of a table with a human-readable name.
* The returned tags manager can be used to list, create, update, or delete tags.
*
* @returns {Tags} A tags manager for this table
* @example
* ```typescript
* const tagsManager = await table.tags();
* await tagsManager.create("v1", 1);
* const tags = await tagsManager.list();
* console.log(tags); // { "v1": { version: 1, manifestSize: ... } }
* ```
*/
abstract tags(): Promise<Tags>;
/**
* Restore the table to the currently checked out version
*
* This operation will fail if checkout has not been called previously
*
* This operation will overwrite the latest version of the table with a
* previous version. Any changes made since the checked out version will
* no longer be visible.
*
* Once the operation concludes the table will no longer be in a checked
* out state and the read_consistency_interval, if any, will apply.
*/
abstract restore(): Promise<void>;
/**
* Optimize the on-disk data and indices for better performance.
*
* Modeled after ``VACUUM`` in PostgreSQL.
*
* Optimization covers three operations:
*
* - Compaction: Merges small files into larger ones
* - Prune: Removes old versions of the dataset
* - Index: Optimizes the indices, adding new data to existing indices
*
*
* The frequency an application should call optimize is based on the frequency of
* data modifications. If data is frequently added, deleted, or updated then
* optimize should be run frequently. A good rule of thumb is to run optimize if
* you have added or modified 100,000 or more records or run more than 20 data
* modification operations.
*/
abstract optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats>;
/** List all indices that have been created with {@link Table.createIndex} */
abstract listIndices(): Promise<IndexConfig[]>;
/** Return the table as an arrow table */
abstract toArrow(): Promise<ArrowTable>;
abstract mergeInsert(on: string | string[]): MergeInsertBuilder;
/** List all the stats of a specified index
*
* @param {string} name The name of the index.
* @returns {IndexStatistics | undefined} The stats of the index. If the index does not exist, it will return undefined
*
* Use {@link Table.listIndices} to find the names of the indices.
*/
abstract indexStats(name: string): Promise<IndexStatistics | undefined>;
/** Returns table and fragment statistics
*
* @returns {TableStatistics} The table and fragment statistics
*
*/
abstract stats(): Promise<TableStatistics>;
/**
* Get the initial storage options that were passed in when opening this table.
*
* For dynamically refreshed options (e.g., credential vending), use
* {@link Table.latestStorageOptions}.
*
* Warning: This is an internal API and the return value is subject to change.
*
* @returns The storage options, or undefined if no storage options were configured.
*/
abstract initialStorageOptions(): Promise<
Record<string, string> | null | undefined
>;
/**
* Get the latest storage options, refreshing from provider if configured.
*
* This method is useful for credential vending scenarios where storage options
* may be refreshed dynamically. If no dynamic provider is configured, this
* returns the initial static options.
*
* Warning: This is an internal API and the return value is subject to change.
*
* @returns The storage options, or undefined if no storage options were configured.
*/
abstract latestStorageOptions(): Promise<
Record<string, string> | null | undefined
>;
}
export class LocalTable extends Table {
private readonly inner: _NativeTable;
constructor(inner: _NativeTable) {
super();
this.inner = inner;
}
get name(): string {
return this.inner.name;
}
isOpen(): boolean {
return this.inner.isOpen();
}
close(): void {
this.inner.close();
}
display(): string {
return this.inner.display();
}
private async getEmbeddingFunctions(): Promise<
Map<string, EmbeddingFunctionConfig>
> {
const schema = await this.schema();
const registry = getRegistry();
return registry.parseFunctions(schema.metadata);
}
/** Get the schema of the table. */
async schema(): Promise<Schema> {
const schemaBuf = await this.inner.schema();
const tbl = tableFromIPC(schemaBuf);
return tbl.schema;
}
async add(data: Data, options?: Partial<AddDataOptions>): Promise<AddResult> {
const mode = options?.mode ?? "append";
const schema = await this.schema();
const buffer = await fromDataToBuffer(data, undefined, schema);
// Wrap the user callback so a thrown error doesn't surface as an
// unhandled exception (the callback fires from a napi threadsafe
// function — exceptions there crash the process).
const userProgress = options?.progress;
const progress = userProgress
? (p: WriteProgress) => {
try {
userProgress(p);
} catch (e) {
console.warn("Table.add progress callback threw:", e);
}
}
: undefined;
return await this.inner.add(buffer, mode, progress);
}
async update(
optsOrUpdates:
| (Map<string, string> | Record<string, string>)
| ({
values: Map<string, IntoSql> | Record<string, IntoSql>;
} & Partial<UpdateOptions>)
| ({
valuesSql: Map<string, string> | Record<string, string>;
} & Partial<UpdateOptions>),
options?: Partial<UpdateOptions>,
): Promise<UpdateResult> {
const isValues =
"values" in optsOrUpdates && typeof optsOrUpdates.values !== "string";
const isValuesSql =
"valuesSql" in optsOrUpdates &&
typeof optsOrUpdates.valuesSql !== "string";
const isMap = (obj: unknown): obj is Map<string, string> => {
return obj instanceof Map;
};
let predicate;
let columns: [string, string][];
switch (true) {
case isMap(optsOrUpdates):
columns = Array.from(optsOrUpdates.entries());
predicate = options?.where;
break;
case isValues && isMap(optsOrUpdates.values):
columns = Array.from(optsOrUpdates.values.entries()).map(([k, v]) => [
k,
toSQL(v),
]);
predicate = optsOrUpdates.where;
break;
case isValues && !isMap(optsOrUpdates.values):
columns = Object.entries(optsOrUpdates.values).map(([k, v]) => [
k,
toSQL(v),
]);
predicate = optsOrUpdates.where;
break;
case isValuesSql && isMap(optsOrUpdates.valuesSql):
columns = Array.from(optsOrUpdates.valuesSql.entries());
predicate = optsOrUpdates.where;
break;
case isValuesSql && !isMap(optsOrUpdates.valuesSql):
columns = Object.entries(optsOrUpdates.valuesSql).map(([k, v]) => [
k,
v,
]);
predicate = optsOrUpdates.where;
break;
default:
columns = Object.entries(optsOrUpdates as Record<string, string>);
predicate = options?.where;
}
return await this.inner.update(predicate, columns);
}
async countRows(filter?: string): Promise<number> {
return await this.inner.countRows(filter);
}
async delete(predicate: string): Promise<DeleteResult> {
return await this.inner.delete(predicate);
}
async createIndex(column: string, options?: Partial<IndexOptions>) {
// Bit of a hack to get around the fact that TS has no package-scope.
// biome-ignore lint/suspicious/noExplicitAny: skip
const nativeIndex = (options?.config as any)?.inner;
await this.inner.createIndex(
nativeIndex,
column,
options?.replace,
options?.waitTimeoutSeconds,
options?.name,
options?.train,
);
}
async dropIndex(name: string): Promise<void> {
await this.inner.dropIndex(name);
}
async prewarmIndex(name: string): Promise<void> {
await this.inner.prewarmIndex(name);
}
async prewarmData(columns?: string[]): Promise<void> {
await this.inner.prewarmData(columns);
}
async waitForIndex(
indexNames: string[],
timeoutSeconds: number,
): Promise<void> {
await this.inner.waitForIndex(indexNames, timeoutSeconds);
}
takeOffsets(offsets: number[]): TakeQuery {
return new TakeQuery(this.inner.takeOffsets(offsets));
}
takeRowIds(rowIds: readonly (bigint | number)[]): TakeQuery {
const ids = rowIds.map((id) => {
if (typeof id === "bigint") {
return id;
}
if (!Number.isInteger(id)) {
throw new Error("Row id must be an integer (or bigint)");
}
if (id < 0) {
throw new Error("Row id cannot be negative");
}
if (!Number.isSafeInteger(id)) {
throw new Error("Row id is too large for number; use bigint instead");
}
return BigInt(id);
});
return new TakeQuery(this.inner.takeRowIds(ids));
}
query(): Query {
return new Query(this.inner);
}
search(
query: string | IntoVector | MultiVector | FullTextQuery,
queryType: string = "auto",
ftsColumns?: string | string[],
): VectorQuery | Query {
if (typeof query !== "string" && !instanceOfFullTextQuery(query)) {
if (queryType === "fts") {
throw new Error("Cannot perform full text search on a vector query");
}
return this.vectorSearch(query);
}
// If the query is a string, we need to determine if it is a vector query or a full text search query
if (queryType === "fts") {
return this.query().fullTextSearch(query, {
columns: ftsColumns,
});
}
// The query type is auto or vector
// fall back to full text search if no embedding functions are defined and the query is a string
if (
queryType === "auto" &&
(getRegistry().length() === 0 || instanceOfFullTextQuery(query))
) {
return this.query().fullTextSearch(query, {
columns: ftsColumns,
});
}
const queryPromise = this.getEmbeddingFunctions().then(
async (functions) => {
// TODO: Support multiple embedding functions
const embeddingFunc: EmbeddingFunctionConfig | undefined = functions
.values()
.next().value;
if (!embeddingFunc) {
return Promise.reject(
new Error("No embedding functions are defined in the table"),
);
}
return await embeddingFunc.function.computeQueryEmbeddings(query);
},
);
return this.query().nearestTo(queryPromise);
}
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
if (isMultiVector(vector)) {
const query = this.query().nearestTo(vector[0]);
for (const v of vector.slice(1)) {
query.addQueryVector(v);
}
return query;
}
return this.query().nearestTo(vector);
}
// TODO: Support BatchUDF
async addColumns(
newColumnTransforms: AddColumnsSql[] | Field | Field[] | Schema,
): Promise<AddColumnsResult> {
// Handle single Field -> convert to array of Fields
if (newColumnTransforms instanceof Field) {
newColumnTransforms = [newColumnTransforms];
}
// Handle array of Fields -> convert to Schema
if (
Array.isArray(newColumnTransforms) &&
newColumnTransforms.length > 0 &&
newColumnTransforms[0] instanceof Field
) {
const fields = newColumnTransforms as Field[];
newColumnTransforms = new Schema(fields);
}
// Handle Schema -> use schema-based approach
if (newColumnTransforms instanceof Schema) {
const schema = newColumnTransforms;
// Convert schema to buffer using Arrow IPC format
const emptyTable = makeEmptyTable(schema);
const schemaBuf = await fromTableToBuffer(emptyTable);
return await this.inner.addColumnsWithSchema(schemaBuf);
}
// Handle SQL expressions (existing functionality)
if (Array.isArray(newColumnTransforms)) {
return await this.inner.addColumns(
newColumnTransforms as AddColumnsSql[],
);
}
throw new Error("Invalid input type for addColumns");
}
async alterColumns(
columnAlterations: ColumnAlteration[],
): Promise<AlterColumnsResult> {
const processedAlterations = columnAlterations.map((alteration) => {
if (typeof alteration.dataType === "string") {
return {
...alteration,
dataType: JSON.stringify({ type: alteration.dataType }),
};
} else if (alteration.dataType === undefined) {
return {
...alteration,
dataType: undefined,
};
} else {
const dataType = sanitizeType(alteration.dataType);
return {
...alteration,
dataType: JSON.stringify(dataTypeToJson(dataType)),
};
}
});
return await this.inner.alterColumns(processedAlterations);
}
async dropColumns(columnNames: string[]): Promise<DropColumnsResult> {
return await this.inner.dropColumns(columnNames);
}
async setUnenforcedPrimaryKey(columns: string | string[]): Promise<void> {
const cols = typeof columns === "string" ? [columns] : columns;
return await this.inner.setUnenforcedPrimaryKey(cols);
}
async setLsmWriteSpec(spec: LsmWriteSpec): Promise<void> {
return await this.inner.setLsmWriteSpec(spec);
}
async unsetLsmWriteSpec(): Promise<void> {
return await this.inner.unsetLsmWriteSpec();
}
async version(): Promise<number> {
return await this.inner.version();
}
async checkout(version: number | string): Promise<void> {
if (typeof version === "string") {
return this.inner.checkoutTag(version);
}
return this.inner.checkout(version);
}
async checkoutLatest(): Promise<void> {
await this.inner.checkoutLatest();
}
async listVersions(): Promise<Version[]> {
return (await this.inner.listVersions()).map((version) => ({
version: version.version,
timestamp: new Date(version.timestamp / 1000),
metadata: version.metadata,
}));
}
async restore(): Promise<void> {
await this.inner.restore();
}
async tags(): Promise<Tags> {
return await this.inner.tags();
}
async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
let cleanupOlderThanMs;
if (
options?.cleanupOlderThan !== undefined &&
options?.cleanupOlderThan !== null
) {
cleanupOlderThanMs =
new Date().getTime() - options.cleanupOlderThan.getTime();
}
return await this.inner.optimize(
cleanupOlderThanMs,
options?.deleteUnverified,
);
}
async listIndices(): Promise<IndexConfig[]> {
return await this.inner.listIndices();
}
async toArrow(): Promise<ArrowTable> {
return await this.query().toArrow();
}
async indexStats(name: string): Promise<IndexStatistics | undefined> {
const stats = await this.inner.indexStats(name);
if (stats === null) {
return undefined;
}
return stats;
}
async stats(): Promise<TableStatistics> {
return await this.inner.stats();
}
async initialStorageOptions(): Promise<
Record<string, string> | null | undefined
> {
return await this.inner.initialStorageOptions();
}
async latestStorageOptions(): Promise<
Record<string, string> | null | undefined
> {
return await this.inner.latestStorageOptions();
}
mergeInsert(on: string | string[]): MergeInsertBuilder {
on = Array.isArray(on) ? on : [on];
return new MergeInsertBuilder(this.inner.mergeInsert(on), this.schema());
}
/**
* Check if the table uses the new manifest path scheme.
*
* This function will return true if the table uses the V2 manifest
* path scheme.
*/
async usesV2ManifestPaths(): Promise<boolean> {
return await this.inner.usesV2ManifestPaths();
}
/**
* Migrate the table to use the new manifest path scheme.
*
* This function will rename all V1 manifests to V2 manifest paths.
* These paths provide more efficient opening of datasets with many versions
* on object stores.
*
* This function is idempotent, and can be run multiple times without
* changing the state of the object store.
*
* However, it should not be run while other concurrent operations are happening.
* And it should also run until completion before resuming other operations.
*/
async migrateManifestPathsV2(): Promise<void> {
await this.inner.migrateManifestPathsV2();
}
}
/**
* A definition of a column alteration. The alteration changes the column at
* `path` to have the new name `name`, to be nullable if `nullable` is true,
* and to have the data type `data_type`. At least one of `rename` or `nullable`
* must be provided.
*/
export interface ColumnAlteration {
/**
* The path to the column to alter. This is a dot-separated path to the column.
* If it is a top-level column then it is just the name of the column. If it is
* a nested column then it is the path to the column, e.g. "a.b.c" for a column
* `c` nested inside a column `b` nested inside a column `a`.
*/
path: string;
/**
* The new name of the column. If not provided then the name will not be changed.
* This must be distinct from the names of all other columns in the table.
*/
rename?: string;
/**
* A new data type for the column. If not provided then the data type will not be changed.
* Changing data types is limited to casting to the same general type. For example, these
* changes are valid:
* * `int32` -> `int64` (integers)
* * `double` -> `float` (floats)
* * `string` -> `large_string` (strings)
* But these changes are not:
* * `int32` -> `double` (mix integers and floats)
* * `string` -> `int32` (mix strings and integers)
*/
dataType?: string | DataType;
/** Set the new nullability. Note that a nullable column cannot be made non-nullable. */
nullable?: boolean;
}