feat: support binary vector and IVF_FLAT in TypeScript (#2221)

resolve #2218 --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2026-01-10 13:52:58 +00:00 · 2025-03-22 01:57:08 +08:00
parent 2bfdef2624
commit bdb6c09c3b
11 changed files with 406 additions and 5 deletions
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -53,6 +53,7 @@ export {
  Index,
  IndexOptions,
  IvfPqOptions,
+  IvfFlatOptions,
  HnswPqOptions,
  HnswSqOptions,
  FtsOptions,
@@ -79,7 +80,7 @@ export {
  DataLike,
  IntoVector,
 } from "./arrow";
-export { IntoSql } from "./util";
+export { IntoSql, packBits } from "./util";

 /**
 * Connect to a LanceDB instance at the given URI.
--- a/nodejs/lancedb/indices.ts
+++ b/nodejs/lancedb/indices.ts
@@ -327,6 +327,94 @@ export interface HnswSqOptions {
  efConstruction?: number;
 }

+/**
+ * Options to create an `IVF_FLAT` index.
+ *
+ * Every field is optional; the default for each field is described on the field itself.
+ */
+export interface IvfFlatOptions {
+  /**
+   * The number of IVF partitions to create.
+   *
+   * This value should generally scale with the number of rows in the dataset.
+   * By default the number of partitions is the square root of the number of
+   * rows.
+   *
+   * If this value is too large then the first part of the search (picking the
+   * right partition) will be slow.  If this value is too small then the second
+   * part of the search (searching within a partition) will be slow.
+   */
+  numPartitions?: number;
+
+  /**
+   * Distance type to use to build the index.
+   *
+   * Default value is "l2".
+   *
+   * This is used when training the index to calculate the IVF partitions
+   * (vectors are grouped in partitions with similar vectors according to this
+   * distance type).
+   *
+   * The distance type used to train an index MUST match the distance type used
+   * to search the index.  Failure to do so will yield inaccurate results.
+   *
+   * The following distance types are available:
+   *
+   * "l2" - Euclidean distance. This is a very common distance metric that
+   * accounts for both magnitude and direction when determining the distance
+   * between vectors. l2 distance has a range of [0, ∞).
+   *
+   * "cosine" - Cosine distance.  Cosine distance is a distance metric
+   * calculated from the cosine similarity between two vectors. Cosine
+   * similarity is a measure of similarity between two non-zero vectors of an
+   * inner product space. It is defined to equal the cosine of the angle
+   * between them.  Unlike l2, the cosine distance is not affected by the
+   * magnitude of the vectors.  Cosine distance has a range of [0, 2].
+   *
+   * Note: the cosine distance is undefined when one (or both) of the vectors
+   * are all zeros (there is no direction).  These vectors are invalid and may
+   * never be returned from a vector search.
+   *
+   * "dot" - Dot product. Dot distance is the dot product of two vectors. Dot
+   * distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
+   * l2 norm is 1), then dot distance is equivalent to the cosine distance.
+   *
+   * "hamming" - Hamming distance. Hamming distance is a distance metric
+   * calculated from the number of bits that are different between two vectors.
+   * Hamming distance has a range of [0, dimension]. Note that the hamming
+   * distance is only valid for binary vectors.
+   */
+  distanceType?: "l2" | "cosine" | "dot" | "hamming";
+
+  /**
+   * Maximum number of iterations used to train IVF kmeans.
+   *
+   * When training an IVF_FLAT index we use kmeans to calculate the partitions.  This parameter
+   * controls how many iterations of kmeans to run.
+   *
+   * Increasing this might improve the quality of the index but in most cases these extra
+   * iterations have diminishing returns.
+   *
+   * The default value is 50.
+   */
+  maxIterations?: number;
+
+  /**
+   * The number of vectors, per partition, to sample when training IVF kmeans.
+   *
+   * When an IVF_FLAT index is trained, we need to calculate partitions.  These are groups
+   * of vectors that are similar to each other.  To do this we use an algorithm called kmeans.
+   *
+   * Running kmeans on a large dataset can be slow.  To speed this up we run kmeans on a
+   * random sample of the data.  This parameter controls the size of the sample.  The total
+   * number of vectors used to train the index is `sampleRate * numPartitions`.
+   *
+   * Increasing this value might improve the quality of the index but in most cases the
+   * default should be sufficient.
+   *
+   * The default value is 256.
+   */
+  sampleRate?: number;
+}
+
 /**
 * Options to create a full text search index
 */
@@ -426,6 +514,33 @@ export class Index {
    );
  }

+  /**
+   * Create an IVF_FLAT index.
+   *
+   * The index groups vectors into partitions of similar vectors.  Each partition keeps track
+   * of a centroid, which is the average value of all vectors in the group.
+   *
+   * During a query the centroids are compared with the query vector to find the closest
+   * partitions.  The vectors in those partitions are then searched to find the closest
+   * vectors.
+   *
+   * The partitioning process is called IVF and the `numPartitions` option controls how many
+   * groups to create.
+   *
+   * Note that training an IVF_FLAT index on a large dataset is a slow operation and
+   * currently is also a memory intensive operation.
+   *
+   * @param options - optional index settings; any field left unset is forwarded as
+   * `undefined` to the native index builder.
+   * @returns an `Index` wrapping the native IVF_FLAT index configuration.
+   */
+  static ivfFlat(options?: Partial<IvfFlatOptions>) {
+    // Read the settings once; a missing options object behaves like an empty one.
+    const { distanceType, numPartitions, maxIterations, sampleRate } =
+      options ?? {};
+    const nativeIndex = LanceDbIndex.ivfFlat(
+      distanceType,
+      numPartitions,
+      maxIterations,
+      sampleRate,
+    );
+    return new Index(nativeIndex);
+  }
+
  /**
   * Create a btree index
   *
--- a/nodejs/lancedb/util.ts
+++ b/nodejs/lancedb/util.ts
@@ -35,6 +35,16 @@ export function toSQL(value: IntoSql): string {
  }
 }

+/**
+ * Pack an array of bit values into bytes, eight input entries per output element.
+ *
+ * Entry `i` of `data` is shifted into bit `i & 7` of output element `i >> 3`, so the
+ * first entry of every group of eight lands in the least significant bit.  A trailing
+ * group with fewer than eight entries still produces an output element.
+ *
+ * Entries are shifted and OR-ed in as given; they are not normalized to 0 or 1.
+ *
+ * @param data - the bit values to pack
+ * @returns one number per group of up to eight input entries
+ */
+export function packBits(data: Array<number>): Array<number> {
+  const byteCount = Math.ceil(data.length / 8);
+  return Array.from({ length: byteCount }, (_, byteIndex) => {
+    const start = byteIndex * 8;
+    const end = Math.min(start + 8, data.length);
+    let byte = 0;
+    for (let pos = start; pos < end; pos++) {
+      // `start` is a multiple of 8, so `pos - start` is the bit offset within the byte.
+      byte |= data[pos] << (pos - start);
+    }
+    return byte;
+  });
+}
+
 export class TTLCache {
  // biome-ignore lint/suspicious/noExplicitAny: <explanation>
  private readonly cache: Map<string, { value: any; expires: number }>;