feat: support binary vector and IVF_FLAT in TypeScript (#2221)

resolve #2218 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-01-08 12:52:58 +00:00 · 2025-03-22 01:57:08 +08:00
parent 2bfdef2624
commit bdb6c09c3b
11 changed files with 406 additions and 5 deletions
--- a/nodejs/test/table.test.ts
+++ b/nodejs/test/table.test.ts
@@ -21,6 +21,7 @@ import {
  Int64,
  List,
  Schema,
+  Uint8,
  Utf8,
  makeArrowTable,
 } from "../lancedb/arrow";
@@ -740,6 +741,38 @@ describe("When creating an index", () => {
    expect(stats).toBeUndefined();
  });

+  // Builds an IVF_FLAT index with hamming distance over 32-byte (Uint8)
+  // vectors and checks that a nearest-neighbour query returns the requested
+  // number of rows.
+  test("create ivf_flat with binary vectors", async () => {
+    // Math.random() is in [0, 1), so scaling by 256 covers every byte value
+    // 0..255 (scaling by 255 could never produce 255).
+    const randomBytes = (len: number) =>
+      Array.from({ length: len }, () => Math.floor(Math.random() * 256));
+
+    const db = await connect(tmpDir.name);
+    const binarySchema = new Schema([
+      new Field("id", new Int32(), true),
+      new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
+    ]);
+    const tbl = await db.createTable(
+      "binary",
+      makeArrowTable(
+        Array.from({ length: 300 }, (_, i) => ({
+          id: i,
+          vec: randomBytes(32),
+        })),
+        { schema: binarySchema },
+      ),
+    );
+    await tbl.createIndex("vec", {
+      config: Index.ivfFlat({ numPartitions: 10, distanceType: "hamming" }),
+    });
+
+    // query with a binary vector of the same byte width as the column
+    const queryVec = randomBytes(32);
+    const rst = await tbl.query().limit(5).nearestTo(queryVec).toArrow();
+    expect(rst.numRows).toBe(5);
+  });
+
  // TODO: Move this test to the query API test (making sure we can reject queries
  // when the dimension is incorrect)
  test("two columns with different dimensions", async () => {
--- a/nodejs/examples/search.test.ts
+++ b/nodejs/examples/search.test.ts
@@ -4,9 +4,12 @@ import { expect, test } from "@jest/globals";
 // --8<-- [start:import]
 import * as lancedb from "@lancedb/lancedb";
 // --8<-- [end:import]
+// --8<-- [start:import_bin_util]
+import { Field, FixedSizeList, Int32, Schema, Uint8 } from "apache-arrow";
+// --8<-- [end:import_bin_util]
 import { withTempDirectory } from "./util.ts";

-test("full text search", async () => {
+test("vector search", async () => {
  await withTempDirectory(async (databaseDir) => {
    {
      const db = await lancedb.connect(databaseDir);
@@ -14,8 +17,6 @@ test("full text search", async () => {
      const data = Array.from({ length: 10_000 }, (_, i) => ({
        vector: Array(128).fill(i),
        id: `${i}`,
-        content: "",
-        longId: `${i}`,
      }));

      await db.createTable("my_vectors", data);
@@ -52,5 +53,41 @@ test("full text search", async () => {
      expect(r.distance).toBeGreaterThanOrEqual(0.1);
      expect(r.distance).toBeLessThan(0.2);
    }
+
+    {
+      // --8<-- [start:ingest_binary_data]
+      const schema = new Schema([
+        new Field("id", new Int32(), true),
+        new Field("vec", new FixedSizeList(32, new Field("item", new Uint8()))),
+      ]);
+      const data = lancedb.makeArrowTable(
+        Array(1_000)
+          .fill(0)
+          .map((_, i) => ({
+            // The 256 bits are stored in 32 bytes. If your data is already
+            // packed into bytes, you can skip the packBits step.
+            id: i,
+            vec: lancedb.packBits(Array(256).fill(i % 2)),
+          })),
+        { schema: schema },
+      );
+
+      const tbl = await db.createTable("binary_table", data);
+      await tbl.createIndex("vec", {
+        config: lancedb.Index.ivfFlat({
+          numPartitions: 10,
+          distanceType: "hamming",
+        }),
+      });
+      // --8<-- [end:ingest_binary_data]
+
+      // --8<-- [start:search_binary_data]
+      // The query is 32 bytes, matching the width of the "vec" column.
+      // Math.random() is in [0, 1), so multiply by 256 to cover 0..255.
+      const query = Array(32)
+        .fill(1)
+        .map(() => Math.floor(Math.random() * 256));
+      const results = await tbl.query().nearestTo(query).limit(10).toArrow();
+      // --8<-- [end:search_binary_data]
+      expect(results.numRows).toBe(10);
+    }
  });
 });
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -53,6 +53,7 @@ export {
  Index,
  IndexOptions,
  IvfPqOptions,
+  IvfFlatOptions,
  HnswPqOptions,
  HnswSqOptions,
  FtsOptions,
@@ -79,7 +80,7 @@ export {
  DataLike,
  IntoVector,
 } from "./arrow";
-export { IntoSql } from "./util";
+export { IntoSql, packBits } from "./util";

 /**
 * Connect to a LanceDB instance at the given URI.
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -327,6 +327,94 @@ export interface HnswSqOptions {
  efConstruction?: number;
 }

+/**
+ * Options to create an `IVF_FLAT` index.
+ *
+ * Every field is optional; see each field for its documented default.
+ */
+export interface IvfFlatOptions {
+  /**
+   * The number of IVF partitions to create.
+   *
+   * This value should generally scale with the number of rows in the dataset.
+   * By default the number of partitions is the square root of the number of
+   * rows.
+   *
+   * If this value is too large then the first part of the search (picking the
+   * right partition) will be slow.  If this value is too small then the second
+   * part of the search (searching within a partition) will be slow.
+   */
+  numPartitions?: number;
+
+  /**
+   * Distance type to use to build the index.
+   *
+   * Default value is "l2".
+   *
+   * This is used when training the index to calculate the IVF partitions
+   * (vectors are grouped in partitions with similar vectors according to this
+   * distance type).
+   *
+   * The distance type used to train an index MUST match the distance type used
+   * to search the index.  Failure to do so will yield inaccurate results.
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. l2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance.  Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them.  Unlike l2, the cosine distance is not affected by the
+   * magnitude of the vectors.  Cosine distance has a range of [0, 2].
+   *
+   * Note: the cosine distance is undefined when one (or both) of the vectors
+   * are all zeros (there is no direction).  These vectors are invalid and may
+   * never be returned from a vector search.
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
+   *
+   * "hamming" - Hamming distance. Hamming distance is a distance metric
+   * calculated from the number of bits that are different between two vectors.
+   * Hamming distance has a range of [0, dimension]. Note that the hamming distance
+   * is only valid for binary vectors.
+   */
+  distanceType?: "l2" | "cosine" | "dot" | "hamming";
+
+  /**
+   * Maximum number of kmeans iterations to run when training the IVF partitions.
+   *
+   * When training an IVF FLAT index we use kmeans to calculate the partitions.  This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases these extra
+   * iterations have diminishing returns.
+   *
+   * The default value is 50.
+   */
+  maxIterations?: number;
+
+  /**
+   * The number of vectors, per partition, to sample when training IVF kmeans.
+   *
+   * When an IVF FLAT index is trained, we need to calculate partitions.  These are groups
+   * of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
+   * random sample of the data.  This parameter controls the size of the sample.  The total
+   * number of vectors used to train the index is `sampleRate * numPartitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   * The default value is 256.
+   */
+  sampleRate?: number;
+}
+
 /**
 * Options to create a full text search index
 */
@@ -426,6 +514,33 @@ export class Index {
    );
  }

+  /**
+   * Create an IvfFlat index
+   *
+   * An IVF FLAT index groups vectors into partitions of similar vectors.  Each
+   * partition keeps track of a centroid, which is the average value of all
+   * vectors in the group.
+   *
+   * At query time the query vector is compared against the centroids to pick
+   * the closest partitions, and only the vectors inside those partitions are
+   * searched for the nearest matches.
+   *
+   * The partitioning process is called IVF and the `numPartitions` option
+   * controls how many groups to create.
+   *
+   * Note that training an IVF FLAT index on a large dataset is a slow operation and
+   * currently is also a memory intensive operation.
+   *
+   * @param options - optional tuning values; any field left unset is passed
+   * through as `undefined`.
+   */
+  static ivfFlat(options?: Partial<IvfFlatOptions>) {
+    // A missing options object behaves like one with every field unset.
+    const { distanceType, numPartitions, maxIterations, sampleRate } =
+      options ?? {};
+    const inner = LanceDbIndex.ivfFlat(
+      distanceType,
+      numPartitions,
+      maxIterations,
+      sampleRate,
+    );
+    return new Index(inner);
+  }
+
  /**
   * Create a btree index
   *
--- a/nodejs/lancedb/util.ts
+++ b/nodejs/lancedb/util.ts
@@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string {
  }
 }

+/**
+ * Pack an array of bits into bytes, eight bits per byte.
+ *
+ * Element `i` of `data` is stored in bit `i % 8` of output byte
+ * `Math.floor(i / 8)`, so the first element of every group of eight lands in
+ * the least significant bit of its byte.  When `data.length` is not a
+ * multiple of 8, the unused high bits of the final byte are left as zero.
+ *
+ * Any truthy element is treated as a set bit and any falsy element (`0`,
+ * `NaN`, a hole) as a cleared bit, so every output value stays within
+ * [0, 255].
+ *
+ * @param data - the bits to pack, normally an array of `0` / `1` values.
+ * @returns an array of `Math.ceil(data.length / 8)` byte values.
+ */
+export function packBits(data: Array<number>): Array<number> {
+  // Round up so a trailing partial group of bits gets its own byte instead of
+  // relying on implicit array growth.
+  const packed: number[] = Array(Math.ceil(data.length / 8)).fill(0);
+  for (let i = 0; i < data.length; i++) {
+    // Collapse the element to a single bit before shifting; shifting a larger
+    // value would spill into neighbouring bit positions or overflow the byte.
+    if (data[i]) {
+      packed[i >> 3] |= 1 << (i & 7);
+    }
+  }
+  return packed;
+}
+
 export class TTLCache {
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  private readonly cache: Map<string, { value: any; expires: number }>;
--- a/nodejs/src/index.rs
+++ b/nodejs/src/index.rs
@@ -4,7 +4,9 @@
 use std::sync::Mutex;

 use lancedb::index::scalar::{BTreeIndexBuilder, FtsIndexBuilder};
-use lancedb::index::vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder};
+use lancedb::index::vector::{
+    IvfFlatIndexBuilder, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder,
+};
 use lancedb::index::Index as LanceDbIndex;
 use napi_derive::napi;

@@ -63,6 +65,32 @@ impl Index {
        })
    }

+    /// Build an `IVF_FLAT` index configuration.
+    ///
+    /// Every argument is optional; one left as `None` keeps whatever
+    /// `IvfFlatIndexBuilder::default()` provides for that setting.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error produced by `parse_distance_type` when
+    /// `distance_type` is given but cannot be parsed.
+    #[napi(factory)]
+    pub fn ivf_flat(
+        distance_type: Option<String>,
+        num_partitions: Option<u32>,
+        max_iterations: Option<u32>,
+        sample_rate: Option<u32>,
+    ) -> napi::Result<Self> {
+        let mut builder = IvfFlatIndexBuilder::default();
+        if let Some(name) = distance_type {
+            // Parsed first, so a bad name returns before any other option is applied.
+            builder = builder.distance_type(parse_distance_type(name)?);
+        }
+        if let Some(count) = num_partitions {
+            builder = builder.num_partitions(count);
+        }
+        if let Some(limit) = max_iterations {
+            builder = builder.max_iterations(limit);
+        }
+        if let Some(rate) = sample_rate {
+            builder = builder.sample_rate(rate);
+        }
+        let index = LanceDbIndex::IvfFlat(builder);
+        Ok(Self {
+            inner: Mutex::new(Some(index)),
+        })
+    }
+
    #[napi(factory)]
    pub fn btree() -> Self {
        Self {