feat: add create_index to the async python API (#1052)

This also refactors the rust lancedb index builder API (and, correspondingly, the nodejs API)
2026-01-04 19:02:58 +00:00 · 2024-03-12 05:17:05 -07:00
parent 90af5cf028
commit f822255683
38 changed files with 1329 additions and 766 deletions
--- a/nodejs/Cargo.toml
+++ b/nodejs/Cargo.toml
@@ -14,12 +14,10 @@ crate-type = ["cdylib"]
 [dependencies]
 arrow-ipc.workspace = true
 futures.workspace = true
-lance-linalg.workspace = true
-lance.workspace = true
 lancedb = { path = "../rust/lancedb" }
 napi = { version = "2.15", default-features = false, features = [
    "napi7",
-    "async"
+    "async",
 ] }
 napi-derive = "2"

--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -27,6 +27,7 @@ import {
  Float64,
 } from "apache-arrow";
 import { makeArrowTable } from "../dist/arrow";
+import { Index } from "../dist/indices";

 describe("Given a table", () => {
  let tmpDir: tmp.DirResult;
@@ -67,19 +68,17 @@ describe("Given a table", () => {
  });
 });

-describe("Test creating index", () => {
+describe("When creating an index", () => {
  let tmpDir: tmp.DirResult;
  const schema = new Schema([
    new Field("id", new Int32(), true),
    new Field("vec", new FixedSizeList(32, new Field("item", new Float32()))),
  ]);
+  let tbl: Table;
+  let queryVec: number[];

-  beforeEach(() => {
+  beforeEach(async () => {
    tmpDir = tmp.dirSync({ unsafeCleanup: true });
-  });
-  afterEach(() => tmpDir.removeCallback());
-
-  test("create vector index with no column", async () => {
    const db = await connect(tmpDir.name);
    const data = makeArrowTable(
      Array(300)
@@ -94,8 +93,13 @@ describe("Test creating index", () => {
        schema,
      },
    );
-    const tbl = await db.createTable("test", data);
-    await tbl.createIndex().build();
+    queryVec = data.toArray()[5].vec.toJSON();
+    tbl = await db.createTable("test", data);
+  });
+  afterEach(() => tmpDir.removeCallback());
+
+  it("should create a vector index on vector columns", async () => {
+    await tbl.createIndex("vec");

    // check index directory
    const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
@@ -103,38 +107,47 @@ describe("Test creating index", () => {
    // TODO: check index type.

    // Search without specifying the column
-    const queryVector = data.toArray()[5].vec.toJSON();
-    const rst = await tbl.query().nearestTo(queryVector).limit(2).toArrow();
+    const rst = await tbl.query().nearestTo(queryVec).limit(2).toArrow();
    expect(rst.numRows).toBe(2);

    // Search with specifying the column
-    const rst2 = await tbl.search(queryVector, "vec").limit(2).toArrow();
+    const rst2 = await tbl.search(queryVec, "vec").limit(2).toArrow();
    expect(rst2.numRows).toBe(2);
    expect(rst.toString()).toEqual(rst2.toString());
  });

-  test("no vector column available", async () => {
-    const db = await connect(tmpDir.name);
-    const tbl = await db.createTable(
-      "no_vec",
-      makeArrowTable([
-        { id: 1, val: 2 },
-        { id: 2, val: 3 },
-      ]),
-    );
-    await expect(tbl.createIndex().build()).rejects.toThrow(
-      "No vector column found",
-    );
+  it("should allow parameters to be specified", async () => {
+    await tbl.createIndex("vec", {
+      config: Index.ivfPq({
+        numPartitions: 10,
+      }),
+    });

-    await tbl.createIndex("val").build();
-    const indexDir = path.join(tmpDir.name, "no_vec.lance", "_indices");
+    // TODO: Verify parameters when we can load index config as part of list indices
+  });
+
+  it("should allow me to replace (or not) an existing index", async () => {
+    await tbl.createIndex("id");
+    // Default is replace=true
+    await tbl.createIndex("id");
+    await expect(tbl.createIndex("id", { replace: false })).rejects.toThrow(
+      "already exists",
+    );
+    await tbl.createIndex("id", { replace: true });
+  });
+
+  test("should create a scalar index on scalar columns", async () => {
+    await tbl.createIndex("id");
+    const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
    expect(fs.readdirSync(indexDir)).toHaveLength(1);

    for await (const r of tbl.query().filter("id > 1").select(["id"])) {
-      expect(r.numRows).toBe(1);
+      expect(r.numRows).toBe(298);
    }
  });

+  // TODO: Move this test to the query API test (making sure we can reject queries
+  // when the dimension is incorrect)
  test("two columns with different dimensions", async () => {
    const db = await connect(tmpDir.name);
    const schema = new Schema([
@@ -164,14 +177,9 @@ describe("Test creating index", () => {
    );

    // Only build index over v1
-    await expect(tbl.createIndex().build()).rejects.toThrow(
-      /.*More than one vector columns found.*/,
-    );
-    tbl
-      .createIndex("vec")
-      // eslint-disable-next-line @typescript-eslint/naming-convention
-      .ivf_pq({ num_partitions: 2, num_sub_vectors: 2 })
-      .build();
+    await tbl.createIndex("vec", {
+      config: Index.ivfPq({ numPartitions: 2, numSubVectors: 2 }),
+    });

    const rst = await tbl
      .query()
@@ -205,30 +213,6 @@ describe("Test creating index", () => {
    expect(rst64Query.toString()).toEqual(rst64Search.toString());
    expect(rst64Query.numRows).toBe(2);
  });
-
-  test("create scalar index", async () => {
-    const db = await connect(tmpDir.name);
-    const data = makeArrowTable(
-      Array(300)
-        .fill(1)
-        .map((_, i) => ({
-          id: i,
-          vec: Array(32)
-            .fill(1)
-            .map(() => Math.random()),
-        })),
-      {
-        schema,
-      },
-    );
-    const tbl = await db.createTable("test", data);
-    await tbl.createIndex("id").build();
-
-    // check index directory
-    const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
-    expect(fs.readdirSync(indexDir)).toHaveLength(1);
-    // TODO: check index type.
-  });
 });

 describe("Read consistency interval", () => {
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -18,15 +18,9 @@ import {
  ConnectionOptions,
 } from "./native.js";

-export {
-  ConnectionOptions,
-  WriteOptions,
-  Query,
-  MetricType,
-} from "./native.js";
-export { Connection } from "./connection";
-export { Table } from "./table";
-export { IvfPQOptions, IndexBuilder } from "./indexer";
+export { ConnectionOptions, WriteOptions, Query } from "./native.js";
+export { Connection, CreateTableOptions } from "./connection";
+export { Table, AddDataOptions } from "./table";

 /**
 * Connect to a LanceDB instance at the given URI.
--- a/nodejs/lancedb/indexer.ts
+++ b/nodejs/lancedb/indexer.ts
@@ -1,105 +0,0 @@
-// Copyright 2024 Lance Developers.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// TODO: Re-enable this as part of https://github.com/lancedb/lancedb/pull/1052
-/* eslint-disable @typescript-eslint/naming-convention */
-
-import {
-  MetricType,
-  IndexBuilder as NativeBuilder,
-  Table as NativeTable,
-} from "./native";
-
-/** Options to create `IVF_PQ` index */
-export interface IvfPQOptions {
-  /** Number of IVF partitions. */
-  num_partitions?: number;
-
-  /** Number of sub-vectors in PQ coding. */
-  num_sub_vectors?: number;
-
-  /** Number of bits used for each PQ code.
-   */
-  num_bits?: number;
-
-  /** Metric type to calculate the distance between vectors.
-   *
-   * Supported metrics: `L2`, `Cosine` and `Dot`.
-   */
-  metric_type?: MetricType;
-
-  /** Number of iterations to train K-means.
-   *
-   * Default is 50. The more iterations it usually yield better results,
-   * but it takes longer to train.
-   */
-  max_iterations?: number;
-
-  sample_rate?: number;
-}
-
-/**
- * Building an index on LanceDB {@link Table}
- *
- * @see {@link Table.createIndex} for detailed usage.
- */
-export class IndexBuilder {
-  private inner: NativeBuilder;
-
-  constructor(tbl: NativeTable) {
-    this.inner = tbl.createIndex();
-  }
-
-  /** Instruct the builder to build an `IVF_PQ` index */
-  ivf_pq(options?: IvfPQOptions): IndexBuilder {
-    this.inner.ivfPq(
-      options?.metric_type,
-      options?.num_partitions,
-      options?.num_sub_vectors,
-      options?.num_bits,
-      options?.max_iterations,
-      options?.sample_rate,
-    );
-    return this;
-  }
-
-  /** Instruct the builder to build a Scalar index. */
-  scalar(): IndexBuilder {
-    this.scalar();
-    return this;
-  }
-
-  /** Set the column(s) to create index on top of. */
-  column(col: string): IndexBuilder {
-    this.inner.column(col);
-    return this;
-  }
-
-  /** Set to true to replace existing index. */
-  replace(val: boolean): IndexBuilder {
-    this.inner.replace(val);
-    return this;
-  }
-
-  /** Specify the name of the index. Optional */
-  name(n: string): IndexBuilder {
-    this.inner.name(n);
-    return this;
-  }
-
-  /** Building the index. */
-  async build() {
-    await this.inner.build();
-  }
-}
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -0,0 +1,195 @@
+// Copyright 2024 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import { Index as LanceDbIndex } from "./native";
+
+/**
+ * Options to create an `IVF_PQ` index
+ */
+export interface IvfPqOptions {
+  /** The number of IVF partitions to create.
+   *
+   * This value should generally scale with the number of rows in the dataset.
+   * By default the number of partitions is the square root of the number of
+   * rows.
+   *
+   * If this value is too large then the first part of the search (picking the
+   * right partition) will be slow.  If this value is too small then the second
+   * part of the search (searching within a partition) will be slow.
+   */
+  numPartitions?: number;
+
+  /** Number of sub-vectors of PQ.
+   *
+   * This value controls how much the vector is compressed during the quantization step.
+   * The more sub vectors there are the less the vector is compressed.  The default is
+   * the dimension of the vector divided by 16.  If the dimension is not evenly divisible
+   * by 16 we use the dimension divided by 8.
+   *
+   * The above two cases are highly preferred.  Having 8 or 16 values per subvector allows
+   * us to use efficient SIMD instructions.
+   *
+   * If the dimension is not divisible by 8 then we use 1 subvector.  This is not ideal and
+   * will likely result in poor performance.
+   */
+  numSubVectors?: number;
+
+  /** Distance type to use to build the index.
+   *
+   * Default value is "l2".
+   *
+   * This is used when training the index to calculate the IVF partitions
+   * (vectors are grouped in partitions with similar vectors according to this
+   * distance type) and to calculate a subvector's code during quantization.
+   *
+   * The distance type used to train an index MUST match the distance type used
+   * to search the index.  Failure to do so will yield inaccurate results.
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. L2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance.  Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them.  Unlike L2, the cosine distance is not affected by the
+   * magnitude of the vectors.  Cosine distance has a range of [0, 2].
+   *
+   * Note: the cosine distance is undefined when one (or both) of the vectors
+   * are all zeros (there is no direction).  These vectors are invalid and may
+   * never be returned from a vector search.
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * L2 norm is 1), then dot distance is equivalent to the cosine distance.
+   */
+  distanceType?: "l2" | "cosine" | "dot";
+
+  /** Maximum number of kmeans iterations used to train the IVF partitions.
+   *
+   * When training an IVF PQ index we use kmeans to calculate the partitions.  This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases these extra
+   * iterations have diminishing returns.
+   *
+   * The default value is 50.
+   */
+  maxIterations?: number;
+
+  /** The number of vectors, per partition, to sample when training IVF kmeans.
+   *
+   * When an IVF PQ index is trained, we need to calculate partitions.  These are groups
+   * of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
+   * random sample of the data.  This parameter controls the size of the sample.  The total
+   * number of vectors used to train the index is `sampleRate * numPartitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   * The default value is 256.
+   */
+  sampleRate?: number;
+}
+
+export class Index {
+  private readonly inner: LanceDbIndex;
+  private constructor(inner: LanceDbIndex) {
+    this.inner = inner;
+  }
+
+  /**
+   * Create an IvfPq index
+   *
+   * This index stores a compressed (quantized) copy of every vector.  These vectors
+   * are grouped into partitions of similar vectors.  Each partition keeps track of
+   * a centroid which is the average value of all vectors in the group.
+   *
+   * During a query the centroids are compared with the query vector to find the closest
+   * partitions.  The compressed vectors in these partitions are then searched to find
+   * the closest vectors.
+   *
+   * The compression scheme is called product quantization.  Each vector is divided into
+   * subvectors and then each subvector is quantized into a small number of bits.  The
+   * `numSubVectors` parameter controls this process, providing a tradeoff
+   * between index size (and thus search speed) and index accuracy.
+   *
+   * The partitioning process is called IVF and the `numPartitions` parameter controls how
+   * many groups to create.
+   *
+   * Note that training an IVF PQ index on a large dataset is a slow operation and
+   * currently is also a memory intensive operation.
+   */
+  static ivfPq(options?: Partial<IvfPqOptions>) {
+    return new Index(
+      LanceDbIndex.ivfPq(
+        options?.distanceType,
+        options?.numPartitions,
+        options?.numSubVectors,
+        options?.maxIterations,
+        options?.sampleRate,
+      ),
+    );
+  }
+
+  /** Create a btree index
+   *
+   * A btree index is an index on a scalar column.  The index stores a copy of the column
+   * in sorted order.  A header entry is created for each block of rows (currently the
+   * block size is fixed at 4096).  These header entries are stored in a separate
+   * cacheable structure (a btree).  To search for data the header is used to determine
+   * which blocks need to be read from disk.
+   *
+   * For example, a btree index in a table with 1Bi rows requires sizeof(Scalar) * 256Ki
+   * bytes of memory and will generally need to read sizeof(Scalar) * 4096 bytes to find
+   * the correct row ids.
+   *
+   * This index is good for scalar columns with mostly distinct values and does best when
+   * the query is highly selective.
+   *
+   * The btree index does not currently have any parameters though parameters such as the
+   * block size may be added in the future.
+   */
+  static btree() {
+    return new Index(LanceDbIndex.btree());
+  }
+}
+
+export interface IndexOptions {
+  /** Advanced index configuration
+   *
+   * This option allows you to specify a specific index to create and also
+   * allows you to pass in configuration for training the index.
+   *
+   * See the static methods on Index for details on the various index types.
+   *
+   * If this is not supplied then column data type(s) and column statistics
+   * will be used to determine the most useful kind of index to create.
+   */
+  config?: Index;
+  /** Whether to replace the existing index
+   *
+   * If this is false, and another index already exists on the same columns
+   * and the same name, then an error will be returned.  This is true even if
+   * that index is out of date.
+   *
+   * The default is true
+   */
+  replace?: boolean;
+}
--- a/nodejs/lancedb/native.d.ts
+++ b/nodejs/lancedb/native.d.ts
@@ -3,15 +3,6 @@

 /* auto-generated by NAPI-RS */

-export const enum IndexType {
-  Scalar = 0,
-  IvfPq = 1
-}
-export const enum MetricType {
-  L2 = 0,
-  Cosine = 1,
-  Dot = 2
-}
 /**
 *  A definition of a column alteration. The alteration changes the column at
 * `path` to have the new name `name`, to be nullable if `nullable` is true,
@@ -93,13 +84,9 @@ export class Connection {
  /** Drop table with the name. Or raise an error if the table does not exist. */
  dropTable(name: string): Promise<void>
 }
-export class IndexBuilder {
-  replace(v: boolean): void
-  column(c: string): void
-  name(name: string): void
-  ivfPq(metricType?: MetricType | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, numBits?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): void
-  scalar(): void
-  build(): Promise<void>
+export class Index {
+  static ivfPq(distanceType?: string | undefined | null, numPartitions?: number | undefined | null, numSubVectors?: number | undefined | null, maxIterations?: number | undefined | null, sampleRate?: number | undefined | null): Index
+  static btree(): Index
 }
 /** Typescript-style Async Iterator over RecordBatches  */
 export class RecordBatchIterator {
@@ -125,7 +112,7 @@ export class Table {
  add(buf: Buffer, mode: string): Promise<void>
  countRows(filter?: string | undefined | null): Promise<number>
  delete(predicate: string): Promise<void>
-  createIndex(): IndexBuilder
+  createIndex(index: Index | undefined | null, column: string, replace?: boolean | undefined | null): Promise<void>
  query(): Query
  addColumns(transforms: Array<AddColumnsSql>): Promise<void>
  alterColumns(alterations: Array<ColumnAlteration>): Promise<void>
--- a/nodejs/lancedb/native.js
+++ b/nodejs/lancedb/native.js
@@ -295,12 +295,10 @@ if (!nativeBinding) {
  throw new Error(`Failed to load native binding`)
 }

-const { Connection, IndexType, MetricType, IndexBuilder, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding
+const { Connection, Index, RecordBatchIterator, Query, Table, WriteMode, connect } = nativeBinding

 module.exports.Connection = Connection
-module.exports.IndexType = IndexType
-module.exports.MetricType = MetricType
-module.exports.IndexBuilder = IndexBuilder
+module.exports.Index = Index
 module.exports.RecordBatchIterator = RecordBatchIterator
 module.exports.Query = Query
 module.exports.Table = Table
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -19,7 +19,7 @@ import {
  Table as _NativeTable,
 } from "./native";
 import { Query } from "./query";
-import { IndexBuilder } from "./indexer";
+import { IndexOptions } from "./indices";
 import { Data, fromDataToBuffer } from "./arrow";

 /**
@@ -103,24 +103,28 @@ export class Table {
    await this.inner.delete(predicate);
  }

-  /** Create an index over the columns.
+  /** Create an index to speed up queries.
   *
-   * @param {string} column The column to create the index on. If not specified,
-   *                        it will create an index on vector field.
+   * Indices can be created on vector columns or scalar columns.
+   * Indices on vector columns will speed up vector searches.
+   * Indices on scalar columns will speed up filtering (in both
+   * vector and non-vector searches)
   *
   * @example
   *
-   * By default, it creates vector idnex on one vector column.
+   * If the column has a vector (fixed size list) data type then
+   * an IvfPq vector index will be created.
   *
   * ```typescript
   * const table = await conn.openTable("my_table");
-   * await table.createIndex().build();
+   * await table.createIndex("vector");
   * ```
   *
-   * You can specify `IVF_PQ` parameters via `ivf_pq({})` call.
+   * For advanced control over vector index creation you can specify
+   * the index type and options.
   * ```typescript
   * const table = await conn.openTable("my_table");
-   * await table.createIndex("my_vec_col")
-   *   .ivf_pq({ num_partitions: 128, num_sub_vectors: 16 })
-   *   .build();
+   * await table.createIndex("vector", {
+   *   config: Index.ivfPq({ numPartitions: 128, numSubVectors: 16 }),
+   * });
   * ```
@@ -131,12 +135,11 @@ export class Table {
   * await table.createIndex("my_float_col").build();
   * ```
   */
-  createIndex(column?: string): IndexBuilder {
-    let builder = new IndexBuilder(this.inner);
-    if (column !== undefined) {
-      builder = builder.column(column);
-    }
-    return builder;
+  async createIndex(column: string, options?: Partial<IndexOptions>) {
+    // Bit of a hack to get around the fact that TS has no package-scope.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const nativeIndex = (options?.config as any)?.inner;
+    await this.inner.createIndex(nativeIndex, column, options?.replace);
  }

  /**
--- a/nodejs/src/error.rs
+++ b/nodejs/src/error.rs
@@ -0,0 +1,12 @@
+pub type Result<T> = napi::Result<T>;
+
+pub trait NapiErrorExt<T> {
+    /// Convert to a napi error using from_reason(err.to_string())
+    fn default_error(self) -> Result<T>;
+}
+
+impl<T> NapiErrorExt<T> for std::result::Result<T, lancedb::Error> {
+    fn default_error(self) -> Result<T> {
+        self.map_err(|err| napi::Error::from_reason(err.to_string()))
+    }
+}
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -14,126 +14,73 @@

 use std::sync::Mutex;

-use lance_linalg::distance::MetricType as LanceMetricType;
-use lancedb::index::IndexBuilder as LanceDbIndexBuilder;
-use lancedb::Table as LanceDbTable;
+use lancedb::index::scalar::BTreeIndexBuilder;
+use lancedb::index::vector::IvfPqIndexBuilder;
+use lancedb::index::Index as LanceDbIndex;
+use lancedb::DistanceType;
 use napi_derive::napi;

 #[napi]
-pub enum IndexType {
-    Scalar,
-    IvfPq,
+pub struct Index {
+    inner: Mutex<Option<LanceDbIndex>>,
 }

-#[napi]
-pub enum MetricType {
-    L2,
-    Cosine,
-    Dot,
-}
-
-impl From<MetricType> for LanceMetricType {
-    fn from(metric: MetricType) -> Self {
-        match metric {
-            MetricType::L2 => Self::L2,
-            MetricType::Cosine => Self::Cosine,
-            MetricType::Dot => Self::Dot,
-        }
+impl Index {
+    pub fn consume(&self) -> napi::Result<LanceDbIndex> {
+        self.inner
+            .lock()
+            .unwrap()
+            .take()
+            .ok_or(napi::Error::from_reason(
+                "attempt to use an index more than once",
+            ))
    }
 }

 #[napi]
-pub struct IndexBuilder {
-    inner: Mutex<Option<LanceDbIndexBuilder>>,
-}
-
-impl IndexBuilder {
-    fn modify(
-        &self,
-        mod_fn: impl Fn(LanceDbIndexBuilder) -> LanceDbIndexBuilder,
-    ) -> napi::Result<()> {
-        let mut inner = self.inner.lock().unwrap();
-        let inner_builder = inner.take().ok_or_else(|| {
-            napi::Error::from_reason("IndexBuilder has already been consumed".to_string())
-        })?;
-        let inner_builder = mod_fn(inner_builder);
-        inner.replace(inner_builder);
-        Ok(())
-    }
-}
-
-#[napi]
-impl IndexBuilder {
-    pub fn new(tbl: &LanceDbTable) -> Self {
-        let inner = tbl.create_index(&[]);
-        Self {
-            inner: Mutex::new(Some(inner)),
-        }
-    }
-
-    #[napi]
-    pub fn replace(&self, v: bool) -> napi::Result<()> {
-        self.modify(|b| b.replace(v))
-    }
-
-    #[napi]
-    pub fn column(&self, c: String) -> napi::Result<()> {
-        self.modify(|b| b.columns(&[c.as_str()]))
-    }
-
-    #[napi]
-    pub fn name(&self, name: String) -> napi::Result<()> {
-        self.modify(|b| b.name(name.as_str()))
-    }
-
-    #[napi]
+impl Index {
+    #[napi(factory)]
    pub fn ivf_pq(
-        &self,
-        metric_type: Option<MetricType>,
+        distance_type: Option<String>,
        num_partitions: Option<u32>,
        num_sub_vectors: Option<u32>,
-        num_bits: Option<u32>,
        max_iterations: Option<u32>,
        sample_rate: Option<u32>,
-    ) -> napi::Result<()> {
-        self.modify(|b| {
-            let mut b = b.ivf_pq();
-            if let Some(metric_type) = metric_type {
-                b = b.metric_type(metric_type.into());
-            }
-            if let Some(num_partitions) = num_partitions {
-                b = b.num_partitions(num_partitions);
-            }
-            if let Some(num_sub_vectors) = num_sub_vectors {
-                b = b.num_sub_vectors(num_sub_vectors);
-            }
-            if let Some(num_bits) = num_bits {
-                b = b.num_bits(num_bits);
-            }
-            if let Some(max_iterations) = max_iterations {
-                b = b.max_iterations(max_iterations);
-            }
-            if let Some(sample_rate) = sample_rate {
-                b = b.sample_rate(sample_rate);
-            }
-            b
+    ) -> napi::Result<Self> {
+        let mut ivf_pq_builder = IvfPqIndexBuilder::default();
+        if let Some(distance_type) = distance_type {
+            let distance_type = match distance_type.as_str() {
+                "l2" => Ok(DistanceType::L2),
+                "cosine" => Ok(DistanceType::Cosine),
+                "dot" => Ok(DistanceType::Dot),
+                _ => Err(napi::Error::from_reason(format!(
+                    "Invalid distance type '{}'.  Must be one of l2, cosine, or dot",
+                    distance_type
+                ))),
+            }?;
+            ivf_pq_builder = ivf_pq_builder.distance_type(distance_type);
+        }
+        if let Some(num_partitions) = num_partitions {
+            ivf_pq_builder = ivf_pq_builder.num_partitions(num_partitions);
+        }
+        if let Some(num_sub_vectors) = num_sub_vectors {
+            ivf_pq_builder = ivf_pq_builder.num_sub_vectors(num_sub_vectors);
+        }
+        if let Some(max_iterations) = max_iterations {
+            ivf_pq_builder = ivf_pq_builder.max_iterations(max_iterations);
+        }
+        if let Some(sample_rate) = sample_rate {
+            ivf_pq_builder = ivf_pq_builder.sample_rate(sample_rate);
+        }
+        Ok(Self {
+            inner: Mutex::new(Some(LanceDbIndex::IvfPq(ivf_pq_builder))),
        })
    }

-    #[napi]
-    pub fn scalar(&self) -> napi::Result<()> {
-        self.modify(|b| b.scalar())
-    }
-
-    #[napi]
-    pub async fn build(&self) -> napi::Result<()> {
-        let inner = self.inner.lock().unwrap().take().ok_or_else(|| {
-            napi::Error::from_reason("IndexBuilder has already been consumed".to_string())
-        })?;
-        inner
-            .build()
-            .await
-            .map_err(|e| napi::Error::from_reason(format!("Failed to build index: {}", e)))?;
-        Ok(())
+    #[napi(factory)]
+    pub fn btree() -> Self {
+        Self {
+            inner: Mutex::new(Some(LanceDbIndex::BTree(BTreeIndexBuilder::default()))),
+        }
    }
 }
--- a/nodejs/src/iterator.rs
+++ b/nodejs/src/iterator.rs
@@ -13,7 +13,7 @@
 // limitations under the License.

 use futures::StreamExt;
-use lance::io::RecordBatchStream;
+use lancedb::arrow::SendableRecordBatchStream;
 use lancedb::ipc::batches_to_ipc_file;
 use napi::bindgen_prelude::*;
 use napi_derive::napi;
@@ -21,12 +21,12 @@ use napi_derive::napi;
 /** Typescript-style Async Iterator over RecordBatches */
 #[napi]
 pub struct RecordBatchIterator {
-    inner: Box<dyn RecordBatchStream + Unpin>,
+    inner: SendableRecordBatchStream,
 }

 #[napi]
 impl RecordBatchIterator {
-    pub(crate) fn new(inner: Box<dyn RecordBatchStream + Unpin>) -> Self {
+    pub(crate) fn new(inner: SendableRecordBatchStream) -> Self {
        Self { inner }
    }

--- a/nodejs/src/lib.rs
+++ b/nodejs/src/lib.rs
@@ -16,6 +16,7 @@ use connection::Connection;
 use napi_derive::*;

 mod connection;
+mod error;
 mod index;
 mod iterator;
 mod query;
--- a/nodejs/src/query.rs
+++ b/nodejs/src/query.rs
@@ -74,6 +74,6 @@ impl Query {
        let inner_stream = self.inner.execute_stream().await.map_err(|e| {
            napi::Error::from_reason(format!("Failed to execute query stream: {}", e))
        })?;
-        Ok(RecordBatchIterator::new(Box::new(inner_stream)))
+        Ok(RecordBatchIterator::new(inner_stream))
    }
 }
--- a/nodejs/src/table.rs
+++ b/nodejs/src/table.rs
@@ -13,13 +13,16 @@
 // limitations under the License.

 use arrow_ipc::writer::FileWriter;
-use lance::dataset::ColumnAlteration as LanceColumnAlteration;
 use lancedb::ipc::ipc_file_to_batches;
-use lancedb::table::{AddDataMode, Table as LanceDbTable};
+use lancedb::table::{
+    AddDataMode, ColumnAlteration as LanceColumnAlteration, NewColumnTransform,
+    Table as LanceDbTable,
+};
 use napi::bindgen_prelude::*;
 use napi_derive::napi;

-use crate::index::IndexBuilder;
+use crate::error::NapiErrorExt;
+use crate::index::Index;
 use crate::query::Query;

 #[napi]
@@ -129,8 +132,22 @@ impl Table {
    }

    #[napi]
-    pub fn create_index(&self) -> napi::Result<IndexBuilder> {
-        Ok(IndexBuilder::new(self.inner_ref()?))
+    pub async fn create_index(
+        &self,
+        index: Option<&Index>,
+        column: String,
+        replace: Option<bool>,
+    ) -> napi::Result<()> {
+        let lancedb_index = if let Some(index) = index {
+            index.consume()?
+        } else {
+            lancedb::index::Index::Auto
+        };
+        let mut builder = self.inner_ref()?.create_index(&[column], lancedb_index);
+        if let Some(replace) = replace {
+            builder = builder.replace(replace);
+        }
+        builder.execute().await.default_error()
    }

    #[napi]
@@ -144,7 +161,7 @@ impl Table {
            .into_iter()
            .map(|sql| (sql.name, sql.value_sql))
            .collect::<Vec<_>>();
-        let transforms = lance::dataset::NewColumnTransform::SqlExpressions(transforms);
+        let transforms = NewColumnTransform::SqlExpressions(transforms);
        self.inner_ref()?
            .add_columns(transforms, None)
            .await