mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-07 04:12:59 +00:00
feat: add create_index to the async python API (#1052)
This also refactors the rust lancedb index builder API (and, correspondingly, the nodejs API)
This commit is contained in:
@@ -18,15 +18,9 @@ import {
|
||||
ConnectionOptions,
|
||||
} from "./native.js";
|
||||
|
||||
export {
|
||||
ConnectionOptions,
|
||||
WriteOptions,
|
||||
Query,
|
||||
MetricType,
|
||||
} from "./native.js";
|
||||
export { Connection } from "./connection";
|
||||
export { Table } from "./table";
|
||||
export { IvfPQOptions, IndexBuilder } from "./indexer";
|
||||
export { ConnectionOptions, WriteOptions, Query } from "./native.js";
|
||||
export { Connection, CreateTableOptions } from "./connection";
|
||||
export { Table, AddDataOptions } from "./table";
|
||||
|
||||
/**
|
||||
* Connect to a LanceDB instance at the given URI.
|
||||
|
||||
@@ -1,105 +0,0 @@
|
||||
// Copyright 2024 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// TODO: Re-enable this as part of https://github.com/lancedb/lancedb/pull/1052
|
||||
/* eslint-disable @typescript-eslint/naming-convention */
|
||||
|
||||
import {
|
||||
MetricType,
|
||||
IndexBuilder as NativeBuilder,
|
||||
Table as NativeTable,
|
||||
} from "./native";
|
||||
|
||||
/** Options to create `IVF_PQ` index */
|
||||
export interface IvfPQOptions {
|
||||
/** Number of IVF partitions. */
|
||||
num_partitions?: number;
|
||||
|
||||
/** Number of sub-vectors in PQ coding. */
|
||||
num_sub_vectors?: number;
|
||||
|
||||
/** Number of bits used for each PQ code.
|
||||
*/
|
||||
num_bits?: number;
|
||||
|
||||
/** Metric type to calculate the distance between vectors.
|
||||
*
|
||||
* Supported metrics: `L2`, `Cosine` and `Dot`.
|
||||
*/
|
||||
metric_type?: MetricType;
|
||||
|
||||
/** Number of iterations to train K-means.
|
||||
*
|
||||
* Default is 50. The more iterations it usually yield better results,
|
||||
* but it takes longer to train.
|
||||
*/
|
||||
max_iterations?: number;
|
||||
|
||||
sample_rate?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Building an index on LanceDB {@link Table}
|
||||
*
|
||||
* @see {@link Table.createIndex} for detailed usage.
|
||||
*/
|
||||
export class IndexBuilder {
|
||||
private inner: NativeBuilder;
|
||||
|
||||
constructor(tbl: NativeTable) {
|
||||
this.inner = tbl.createIndex();
|
||||
}
|
||||
|
||||
/** Instruct the builder to build an `IVF_PQ` index */
|
||||
ivf_pq(options?: IvfPQOptions): IndexBuilder {
|
||||
this.inner.ivfPq(
|
||||
options?.metric_type,
|
||||
options?.num_partitions,
|
||||
options?.num_sub_vectors,
|
||||
options?.num_bits,
|
||||
options?.max_iterations,
|
||||
options?.sample_rate,
|
||||
);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Instruct the builder to build a Scalar index. */
|
||||
scalar(): IndexBuilder {
|
||||
this.scalar();
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Set the column(s) to create index on top of. */
|
||||
column(col: string): IndexBuilder {
|
||||
this.inner.column(col);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Set to true to replace existing index. */
|
||||
replace(val: boolean): IndexBuilder {
|
||||
this.inner.replace(val);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Specify the name of the index. Optional */
|
||||
name(n: string): IndexBuilder {
|
||||
this.inner.name(n);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Building the index. */
|
||||
async build() {
|
||||
await this.inner.build();
|
||||
}
|
||||
}
|
||||
195
nodejs/lancedb/indices.ts
Normal file
195
nodejs/lancedb/indices.ts
Normal file
@@ -0,0 +1,195 @@
|
||||
// Copyright 2024 Lance Developers.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import { Index as LanceDbIndex } from "./native";
|
||||
|
||||
/**
|
||||
* Options to create an `IVF_PQ` index
|
||||
*/
|
||||
export interface IvfPqOptions {
|
||||
/** The number of IVF partitions to create.
|
||||
*
|
||||
* This value should generally scale with the number of rows in the dataset.
|
||||
* By default the number of partitions is the square root of the number of
|
||||
* rows.
|
||||
*
|
||||
* If this value is too large then the first part of the search (picking the
|
||||
* right partition) will be slow. If this value is too small then the second
|
||||
* part of the search (searching within a partition) will be slow.
|
||||
*/
|
||||
numPartitions?: number;
|
||||
|
||||
/** Number of sub-vectors of PQ.
|
||||
*
|
||||
* This value controls how much the vector is compressed during the quantization step.
|
||||
* The more sub vectors there are the less the vector is compressed. The default is
|
||||
* the dimension of the vector divided by 16. If the dimension is not evenly divisible
|
||||
* by 16 we use the dimension divded by 8.
|
||||
*
|
||||
* The above two cases are highly preferred. Having 8 or 16 values per subvector allows
|
||||
* us to use efficient SIMD instructions.
|
||||
*
|
||||
* If the dimension is not visible by 8 then we use 1 subvector. This is not ideal and
|
||||
* will likely result in poor performance.
|
||||
*/
|
||||
numSubVectors?: number;
|
||||
|
||||
/** [DistanceType] to use to build the index.
|
||||
*
|
||||
* Default value is [DistanceType::L2].
|
||||
*
|
||||
* This is used when training the index to calculate the IVF partitions
|
||||
* (vectors are grouped in partitions with similar vectors according to this
|
||||
* distance type) and to calculate a subvector's code during quantization.
|
||||
*
|
||||
* The distance type used to train an index MUST match the distance type used
|
||||
* to search the index. Failure to do so will yield inaccurate results.
|
||||
*
|
||||
* The following distance types are available:
|
||||
*
|
||||
* "l2" - Euclidean distance. This is a very common distance metric that
|
||||
* accounts for both magnitude and direction when determining the distance
|
||||
* between vectors. L2 distance has a range of [0, ∞).
|
||||
*
|
||||
* "cosine" - Cosine distance. Cosine distance is a distance metric
|
||||
* calculated from the cosine similarity between two vectors. Cosine
|
||||
* similarity is a measure of similarity between two non-zero vectors of an
|
||||
* inner product space. It is defined to equal the cosine of the angle
|
||||
* between them. Unlike L2, the cosine distance is not affected by the
|
||||
* magnitude of the vectors. Cosine distance has a range of [0, 2].
|
||||
*
|
||||
* Note: the cosine distance is undefined when one (or both) of the vectors
|
||||
* are all zeros (there is no direction). These vectors are invalid and may
|
||||
* never be returned from a vector search.
|
||||
*
|
||||
* "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
|
||||
* distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
|
||||
* L2 norm is 1), then dot distance is equivalent to the cosine distance.
|
||||
*/
|
||||
distanceType?: "l2" | "cosine" | "dot";
|
||||
|
||||
/** Max iteration to train IVF kmeans.
|
||||
*
|
||||
* When training an IVF PQ index we use kmeans to calculate the partitions. This parameter
|
||||
* controls how many iterations of kmeans to run.
|
||||
*
|
||||
* Increasing this might improve the quality of the index but in most cases these extra
|
||||
* iterations have diminishing returns.
|
||||
*
|
||||
* The default value is 50.
|
||||
*/
|
||||
maxIterations?: number;
|
||||
|
||||
/** The number of vectors, per partition, to sample when training IVF kmeans.
|
||||
*
|
||||
* When an IVF PQ index is trained, we need to calculate partitions. These are groups
|
||||
* of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
||||
*
|
||||
* Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
||||
* random sample of the data. This parameter controls the size of the sample. The total
|
||||
* number of vectors used to train the index is `sample_rate * num_partitions`.
|
||||
*
|
||||
* Increasing this value might improve the quality of the index but in most cases the
|
||||
* default should be sufficient.
|
||||
*
|
||||
* The default value is 256.
|
||||
*/
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
export class Index {
|
||||
private readonly inner: LanceDbIndex;
|
||||
private constructor(inner: LanceDbIndex) {
|
||||
this.inner = inner;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an IvfPq index
|
||||
*
|
||||
* This index stores a compressed (quantized) copy of every vector. These vectors
|
||||
* are grouped into partitions of similar vectors. Each partition keeps track of
|
||||
* a centroid which is the average value of all vectors in the group.
|
||||
*
|
||||
* During a query the centroids are compared with the query vector to find the closest
|
||||
* partitions. The compressed vectors in these partitions are then searched to find
|
||||
* the closest vectors.
|
||||
*
|
||||
* The compression scheme is called product quantization. Each vector is divided into
|
||||
* subvectors and then each subvector is quantized into a small number of bits. the
|
||||
* parameters `num_bits` and `num_subvectors` control this process, providing a tradeoff
|
||||
* between index size (and thus search speed) and index accuracy.
|
||||
*
|
||||
* The partitioning process is called IVF and the `num_partitions` parameter controls how
|
||||
* many groups to create.
|
||||
*
|
||||
* Note that training an IVF PQ index on a large dataset is a slow operation and
|
||||
* currently is also a memory intensive operation.
|
||||
*/
|
||||
static ivfPq(options?: Partial<IvfPqOptions>) {
|
||||
return new Index(
|
||||
LanceDbIndex.ivfPq(
|
||||
options?.distanceType,
|
||||
options?.numPartitions,
|
||||
options?.numSubVectors,
|
||||
options?.maxIterations,
|
||||
options?.sampleRate,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
/** Create a btree index
|
||||
*
|
||||
* A btree index is an index on a scalar columns. The index stores a copy of the column
|
||||
* in sorted order. A header entry is created for each block of rows (currently the
|
||||
* block size is fixed at 4096). These header entries are stored in a separate
|
||||
* cacheable structure (a btree). To search for data the header is used to determine
|
||||
* which blocks need to be read from disk.
|
||||
*
|
||||
* For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
|
||||
* bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
|
||||
* the correct row ids.
|
||||
*
|
||||
* This index is good for scalar columns with mostly distinct values and does best when
|
||||
* the query is highly selective.
|
||||
*
|
||||
* The btree index does not currently have any parameters though parameters such as the
|
||||
* block size may be added in the future.
|
||||
*/
|
||||
static btree() {
|
||||
return new Index(LanceDbIndex.btree());
|
||||
}
|
||||
}
|
||||
|
||||
export interface IndexOptions {
|
||||
/** Advanced index configuration
|
||||
*
|
||||
* This option allows you to specify a specfic index to create and also
|
||||
* allows you to pass in configuration for training the index.
|
||||
*
|
||||
* See the static methods on Index for details on the various index types.
|
||||
*
|
||||
* If this is not supplied then column data type(s) and column statistics
|
||||
* will be used to determine the most useful kind of index to create.
|
||||
*/
|
||||
config?: Index;
|
||||
/** Whether to replace the existing index
|
||||
*
|
||||
* If this is false, and another index already exists on the same columns
|
||||
* and the same name, then an error will be returned. This is true even if
|
||||
* that index is out of date.
|
||||
*
|
||||
* The default is true
|
||||
*/
|
||||
replace?: boolean;
|
||||
}
|
||||
21
nodejs/lancedb/native.d.ts
vendored
21
nodejs/lancedb/native.d.ts
vendored
@@ -3,15 +3,6 @@
|
||||
|
||||
/* auto-generated by NAPI-RS */
|
||||
|
||||
export const enum IndexType {
|
||||
Scalar = 0,
|
||||
IvfPq = 1
|
||||
}
|
||||
export const enum MetricType {
|
||||
L2 = 0,
|
||||
Cosine = 1,
|
||||
Dot = 2
|
||||
}
|
||||
/**
|
||||
* A definition of a column alteration. The alteration changes the column at
|
||||
* `path` to have the new name `name`, to be nullable if `nullable` is true,
|
||||
@@ -93,13 +84,9 @@ export class Connection {
|
||||
/** Drop table with the name. Or raise an error if the table does not exist. */
|
||||
dropTable(name: string): Promise<void>
|
||||
}
|
||||
export class IndexBuilder {
|
||||
replace(v: boolean): void
|
||||
column(c: string): void
|
||||
name(name: string): void
|
||||
ivfPq(metricType?: MetricType | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): void
|
||||
scalar(): void
|
||||
build(): Promise<void>
|
||||
export class Index {
|
||||
static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
|
||||
static btree(): Index
|
||||
}
|
||||
/** Typescript-style Async Iterator over RecordBatches */
|
||||
export class RecordBatchIterator {
|
||||
@@ -125,7 +112,7 @@ export class Table {
|
||||
add(buf: Buffer, mode: string): Promise<void>
|
||||
countRows(filter?: string | undefined | null): Promise<number>
|
||||
delete(predicate: string): Promise<void>
|
||||
createIndex(): IndexBuilder
|
||||
createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
|
||||
query(): Query
|
||||
addColumns(transforms: Array<AddColumnsSql>): Promise<void>
|
||||
alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
|
||||
|
||||
@@ -295,12 +295,10 @@ if (!nativeBinding) {
|
||||
throw new Error(`Failed to load native binding`)
|
||||
}
|
||||
|
||||
const { Connection, IndexType, MetricType, IndexBuilder, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding
|
||||
const { Connection, Index, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding
|
||||
|
||||
module.exports.Connection = Connection
|
||||
module.exports.IndexType = IndexType
|
||||
module.exports.MetricType = MetricType
|
||||
module.exports.IndexBuilder = IndexBuilder
|
||||
module.exports.Index = Index
|
||||
module.exports.RecordBatchIterator = RecordBatchIterator
|
||||
module.exports.Query = Query
|
||||
module.exports.Table = Table
|
||||
|
||||
@@ -19,7 +19,7 @@ import {
|
||||
Table as _NativeTable,
|
||||
} from "./native";
|
||||
import { Query } from "./query";
|
||||
import { IndexBuilder } from "./indexer";
|
||||
import { IndexOptions } from "./indices";
|
||||
import { Data, fromDataToBuffer } from "./arrow";
|
||||
|
||||
/**
|
||||
@@ -103,24 +103,28 @@ export class Table {
|
||||
await this.inner.delete(predicate);
|
||||
}
|
||||
|
||||
/** Create an index over the columns.
|
||||
/** Create an index to speed up queries.
|
||||
*
|
||||
* @param {string} column The column to create the index on. If not specified,
|
||||
* it will create an index on vector field.
|
||||
* Indices can be created on vector columns or scalar columns.
|
||||
* Indices on vector columns will speed up vector searches.
|
||||
* Indices on scalar columns will speed up filtering (in both
|
||||
* vector and non-vector searches)
|
||||
*
|
||||
* @example
|
||||
*
|
||||
* By default, it creates vector idnex on one vector column.
|
||||
* If the column has a vector (fixed size list) data type then
|
||||
* an IvfPq vector index will be created.
|
||||
*
|
||||
* ```typescript
|
||||
* const table = await conn.openTable("my_table");
|
||||
* await table.createIndex().build();
|
||||
* await table.createIndex(["vector"]);
|
||||
* ```
|
||||
*
|
||||
* You can specify `IVF_PQ` parameters via `ivf_pq({})` call.
|
||||
* For advanced control over vector index creation you can specify
|
||||
* the index type and options.
|
||||
* ```typescript
|
||||
* const table = await conn.openTable("my_table");
|
||||
* await table.createIndex("my_vec_col")
|
||||
* await table.createIndex(["vector"], I)
|
||||
* .ivf_pq({ num_partitions: 128, num_sub_vectors: 16 })
|
||||
* .build();
|
||||
* ```
|
||||
@@ -131,12 +135,11 @@ export class Table {
|
||||
* await table.createIndex("my_float_col").build();
|
||||
* ```
|
||||
*/
|
||||
createIndex(column?: string): IndexBuilder {
|
||||
let builder = new IndexBuilder(this.inner);
|
||||
if (column !== undefined) {
|
||||
builder = builder.column(column);
|
||||
}
|
||||
return builder;
|
||||
async createIndex(column: string, options?: Partial<IndexOptions>) {
|
||||
// Bit of a hack to get around the fact that TS has no package-scope.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const nativeIndex = (options?.config as any)?.inner;
|
||||
await this.inner.createIndex(nativeIndex, column, options?.replace);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user