feat: add create_index to the async python API (#1052)

This also refactors the rust lancedb index builder API (and, correspondingly, the nodejs API)
2026-06-01 03:10:43 +00:00 · 2024-03-12 05:17:05 -07:00
parent 90af5cf028
commit f822255683
38 changed files with 1329 additions and 766 deletions
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -18,15 +18,9 @@ import {
  ConnectionOptions,
 } from "./native.js";

-export {
-  ConnectionOptions,
-  WriteOptions,
-  Query,
-  MetricType,
-} from "./native.js";
-export { Connection } from "./connection";
-export { Table } from "./table";
-export { IvfPQOptions, IndexBuilder } from "./indexer";
+export { ConnectionOptions, WriteOptions, Query } from "./native.js";
+export { Connection, CreateTableOptions } from "./connection";
+export { Table, AddDataOptions } from "./table";

 /**
 * Connect to a LanceDB instance at the given URI.
--- a/nodejs/lancedb/indexer.ts
+++ b/nodejs/lancedb/indexer.ts
@@ -1,105 +0,0 @@
-// Copyright 2024 Lance Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// TODO: Re-enable this as part of https://github.com/lancedb/lancedb/pull/1052
-/* eslint-disable @typescript-eslint/naming-convention */
-
-import {
-  MetricType,
-  IndexBuilder as NativeBuilder,
-  Table as NativeTable,
-} from "./native";
-
-/** Options to create `IVF_PQ` index */
-export interface IvfPQOptions {
-  /** Number of IVF partitions. */
-  num_partitions?: number;
-
-  /** Number of sub-vectors in PQ coding. */
-  num_sub_vectors?: number;
-
-  /** Number of bits used for each PQ code.
-   */
-  num_bits?: number;
-
-  /** Metric type to calculate the distance between vectors.
-   *
-   * Supported metrics: `L2`, `Cosine` and `Dot`.
-   */
-  metric_type?: MetricType;
-
-  /** Number of iterations to train K-means.
-   *
-   * Default is 50. The more iterations it usually yield better results,
-   * but it takes longer to train.
-   */
-  max_iterations?: number;
-
-  sample_rate?: number;
-}
-
-/**
- * Building an index on LanceDB {@link Table}
- *
- * @see {@link Table.createIndex} for detailed usage.
- */
-export class IndexBuilder {
-  private inner: NativeBuilder;
-
-  constructor(tbl: NativeTable) {
-    this.inner = tbl.createIndex();
-  }
-
-  /** Instruct the builder to build an `IVF_PQ` index */
-  ivf_pq(options?: IvfPQOptions): IndexBuilder {
-    this.inner.ivfPq(
-      options?.metric_type,
-      options?.num_partitions,
-      options?.num_sub_vectors,
-      options?.num_bits,
-      options?.max_iterations,
-      options?.sample_rate,
-    );
-    return this;
-  }
-
-  /** Instruct the builder to build a Scalar index. */
-  scalar(): IndexBuilder {
-    this.scalar();
-    return this;
-  }
-
-  /** Set the column(s) to create index on top of. */
-  column(col: string): IndexBuilder {
-    this.inner.column(col);
-    return this;
-  }
-
-  /** Set to true to replace existing index. */
-  replace(val: boolean): IndexBuilder {
-    this.inner.replace(val);
-    return this;
-  }
-
-  /** Specify the name of the index. Optional */
-  name(n: string): IndexBuilder {
-    this.inner.name(n);
-    return this;
-  }
-
-  /** Building the index. */
-  async build() {
-    await this.inner.build();
-  }
-}
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -0,0 +1,195 @@
+// Copyright 2024 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import { Index as LanceDbIndex } from "./native";
+
+/**
+ * Options to create an `IVF_PQ` index
+ */
+export interface IvfPqOptions {
+  /** The number of IVF partitions to create.
+   *
+   * This value should generally scale with the number of rows in the dataset.
+   * By default the number of partitions is the square root of the number of
+   * rows.
+   *
+   * If this value is too large then the first part of the search (picking the
+   * right partition) will be slow.  If this value is too small then the second
+   * part of the search (searching within a partition) will be slow.
+   */
+  numPartitions?: number;
+
+  /** Number of sub-vectors of PQ.
+   *
+   * This value controls how much the vector is compressed during the quantization step.
+   * The more sub vectors there are the less the vector is compressed.  The default is
+   * the dimension of the vector divided by 16.  If the dimension is not evenly divisible
+   * by 16 we use the dimension divided by 8.
+   *
+   * The above two cases are highly preferred.  Having 8 or 16 values per subvector allows
+   * us to use efficient SIMD instructions.
+   *
+   * If the dimension is not divisible by 8 then we use 1 subvector.  This is not ideal and
+   * will likely result in poor performance.
+   */
+  numSubVectors?: number;
+
+  /** The distance type to use to build the index.
+   *
+   * Default value is "l2".
+   *
+   * This is used when training the index to calculate the IVF partitions
+   * (vectors are grouped in partitions with similar vectors according to this
+   * distance type) and to calculate a subvector's code during quantization.
+   *
+   * The distance type used to train an index MUST match the distance type used
+   * to search the index.  Failure to do so will yield inaccurate results.
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. L2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance.  Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them.  Unlike L2, the cosine distance is not affected by the
+   * magnitude of the vectors.  Cosine distance has a range of [0, 2].
+   *
+   * Note: the cosine distance is undefined when one (or both) of the vectors
+   * are all zeros (there is no direction).  These vectors are invalid and may
+   * never be returned from a vector search.
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   */
+  distanceType?: "l2" | "cosine" | "dot";
+
+  /** Max iteration to train IVF kmeans.
+   *
+   * When training an IVF PQ index we use kmeans to calculate the partitions.  This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases these extra
+   * iterations have diminishing returns.
+   *
+   * The default value is 50.
+   */
+  maxIterations?: number;
+
+  /** The number of vectors, per partition, to sample when training IVF kmeans.
+   *
+   * When an IVF PQ index is trained, we need to calculate partitions.  These are groups
+   * of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
+   * random sample of the data.  This parameter controls the size of the sample.  The total
+   * number of vectors used to train the index is `sampleRate * numPartitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   * The default value is 256.
+   */
+  sampleRate?: number;
+}
+
+export class Index {
+  private readonly inner: LanceDbIndex;
+  private constructor(inner: LanceDbIndex) {
+    this.inner = inner;
+  }
+
+  /**
+   * Create an IvfPq index
+   *
+   * This index stores a compressed (quantized) copy of every vector.  These vectors
+   * are grouped into partitions of similar vectors.  Each partition keeps track of
+   * a centroid which is the average value of all vectors in the group.
+   *
+   * During a query the centroids are compared with the query vector to find the closest
+   * partitions.  The compressed vectors in these partitions are then searched to find
+   * the closest vectors.
+   *
+   * The compression scheme is called product quantization.  Each vector is divided into
+   * subvectors and then each subvector is quantized into a small number of bits.  The
+   * parameters `num_bits` and `num_subvectors` control this process, providing a tradeoff
+   * between index size (and thus search speed) and index accuracy.
+   *
+   * The partitioning process is called IVF and the `numPartitions` parameter controls how
+   * many groups to create.
+   *
+   * Note that training an IVF PQ index on a large dataset is a slow operation and
+   * currently is also a memory intensive operation.
+   */
+  static ivfPq(options?: Partial<IvfPqOptions>) {
+    return new Index(
+      LanceDbIndex.ivfPq(
+        options?.distanceType,
+        options?.numPartitions,
+        options?.numSubVectors,
+        options?.maxIterations,
+        options?.sampleRate,
+      ),
+    );
+  }
+
+  /** Create a btree index
+   *
+   * A btree index is an index on a scalar column.  The index stores a copy of the column
+   * in sorted order.  A header entry is created for each block of rows (currently the
+   * block size is fixed at 4096).  These header entries are stored in a separate
+   * cacheable structure (a btree).  To search for data the header is used to determine
+   * which blocks need to be read from disk.
+   *
+   * For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
+   * bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
+   * the correct row ids.
+   *
+   * This index is good for scalar columns with mostly distinct values and does best when
+   * the query is highly selective.
+   *
+   * The btree index does not currently have any parameters though parameters such as the
+   * block size may be added in the future.
+   */
+  static btree() {
+    return new Index(LanceDbIndex.btree());
+  }
+}
+
+export interface IndexOptions {
+  /** Advanced index configuration
+   *
+   * This option allows you to specify a specific index to create and also
+   * allows you to pass in configuration for training the index.
+   *
+   * See the static methods on Index for details on the various index types.
+   *
+   * If this is not supplied then column data type(s) and column statistics
+   * will be used to determine the most useful kind of index to create.
+   */
+  config?: Index;
+  /** Whether to replace the existing index
+   *
+   * If this is false, and another index already exists on the same columns
+   * and the same name, then an error will be returned.  This is true even if
+   * that index is out of date.
+   *
+   * The default is true
+   */
+  replace?: boolean;
+}
--- a/nodejs/lancedb/native.d.ts
+++ b/nodejs/lancedb/native.d.ts
@@ -3,15 +3,6 @@

 /* auto-generated by NAPI-RS */

-export const enum IndexType {
-  Scalar = 0,
-  IvfPq = 1
-}
-export const enum MetricType {
-  L2 = 0,
-  Cosine = 1,
-  Dot = 2
-}
 /**
 *  A definition of a column alteration. The alteration changes the column at
 * `path` to have the new name `name`, to be nullable if `nullable` is true,
@@ -93,13 +84,9 @@ export class Connection {
  /** Drop table with the name. Or raise an error if the table does not exist. */
  dropTable(name: string): Promise<void>
 }
-export class IndexBuilder {
-  replace(v: boolean): void
-  column(c: string): void
-  name(name: string): void
-  ivfPq(metricType?: MetricType | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): void
-  scalar(): void
-  build(): Promise<void>
+export class Index {
+  static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
+  static btree(): Index
 }
 /** Typescript-style Async Iterator over RecordBatches  */
 export class RecordBatchIterator {
@@ -125,7 +112,7 @@ export class Table {
  add(buf: Buffer, mode: string): Promise<void>
  countRows(filter?: string | undefined | null): Promise<number>
  delete(predicate: string): Promise<void>
-  createIndex(): IndexBuilder
+  createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
  query(): Query
  addColumns(transforms: Array<AddColumnsSql>): Promise<void>
  alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
--- a/nodejs/lancedb/native.js
+++ b/nodejs/lancedb/native.js
@@ -295,12 +295,10 @@ if (!nativeBinding) {
  throw new Error(`Failed to load native binding`)
 }

-const { Connection, IndexType, MetricType, IndexBuilder, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding
+const { Connection, Index, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding

 module.exports.Connection = Connection
-module.exports.IndexType = IndexType
-module.exports.MetricType = MetricType
-module.exports.IndexBuilder = IndexBuilder
+module.exports.Index = Index
 module.exports.RecordBatchIterator = RecordBatchIterator
 module.exports.Query = Query
 module.exports.Table = Table
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -19,7 +19,7 @@ import {
  Table as _NativeTable,
 } from "./native";
 import { Query } from "./query";
-import { IndexBuilder } from "./indexer";
+import { IndexOptions } from "./indices";
 import { Data, fromDataToBuffer } from "./arrow";

 /**
@@ -103,24 +103,28 @@ export class Table {
    await this.inner.delete(predicate);
  }

-  /** Create an index over the columns.
+  /** Create an index to speed up queries.
   *
-   * @param {string} column The column to create the index on. If not specified,
-   *                        it will create an index on vector field.
+   * Indices can be created on vector columns or scalar columns.
+   * Indices on vector columns will speed up vector searches.
+   * Indices on scalar columns will speed up filtering (in both
+   * vector and non-vector searches)
   *
   * @example
   *
-   * By default, it creates vector idnex on one vector column.
+   * If the column has a vector (fixed size list) data type then
+   * an IvfPq vector index will be created.
   *
   * ```typescript
   * const table = await conn.openTable("my_table");
-   * await table.createIndex().build();
+   * await table.createIndex("vector");
   * ```
   *
-   * You can specify `IVF_PQ` parameters via `ivf_pq({})` call.
+   * For advanced control over vector index creation you can specify
+   * the index type and options.
   * ```typescript
   * const table = await conn.openTable("my_table");
-   * await table.createIndex("my_vec_col")
+   * await table.createIndex(["vector"], I)
   *   .ivf_pq({ num_partitions: 128, num_sub_vectors: 16 })
   *   .build();
   * ```
@@ -131,12 +135,11 @@ export class Table {
   * await table.createIndex("my_float_col").build();
   * ```
   */
-  createIndex(column?: string): IndexBuilder {
-    let builder = new IndexBuilder(this.inner);
-    if (column !== undefined) {
-      builder = builder.column(column);
-    }
-    return builder;
+  async createIndex(column: string, options?: Partial<IndexOptions>) {
+    // Bit of a hack to get around the fact that TS has no package-scope.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const nativeIndex = (options?.config as any)?.inner;
+    await this.inner.createIndex(nativeIndex, column, options?.replace);
  }

  /**