From 96c66fd0878165bcef39dc5f98d2a70a53030897 Mon Sep 17 00:00:00 2001
From: BubbleCal <bubble-cal@outlook.com>
Date: Tue, 22 Jul 2025 21:19:34 +0800
Subject: [PATCH] feat: support multivector for JS SDK (#2527)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
---
 docs/package-lock.json                    | 12 ++---
 docs/src/guides/sql_querying.md           | 30 ++++-------
 docs/src/js/classes/MatchQuery.md         |  3 ++
 docs/src/js/classes/Table.md              |  4 +-
 docs/src/js/classes/VectorQuery.md        | 51 ++++++++++++++++++
 docs/src/js/enumerations/Occur.md         |  9 ++++
 docs/src/js/globals.md                    |  1 +
 docs/src/js/interfaces/FtsOptions.md      | 32 ++++++++++-
 docs/src/js/interfaces/OptimizeOptions.md |  4 +-
 docs/src/js/type-aliases/MultiVector.md   | 11 ++++
 docs/test/md_testing.py                   |  3 +-
 node/package-lock.json                    | 65 +++++++++++++++++++++++
 nodejs/__test__/table.test.ts             | 39 ++++++++++++++
 nodejs/lancedb/arrow.ts                   | 14 +++++
 nodejs/lancedb/index.ts                   |  1 +
 nodejs/lancedb/table.ts                   | 18 +++++--
 16 files changed, 262 insertions(+), 35 deletions(-)
 create mode 100644 docs/src/js/type-aliases/MultiVector.md

diff --git a/docs/package-lock.json b/docs/package-lock.json
index 1baad851..e87f3e0e 100644
--- a/docs/package-lock.json
+++ b/docs/package-lock.json
@@ -19,7 +19,7 @@
     },
     "../node": {
       "name": "vectordb",
-      "version": "0.12.0",
+      "version": "0.21.2-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -65,11 +65,11 @@
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.12.0",
-        "@lancedb/vectordb-darwin-x64": "0.12.0",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
-        "@lancedb/vectordb-linux-x64-gnu": "0.12.0",
-        "@lancedb/vectordb-win32-x64-msvc": "0.12.0"
+        "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
+        "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
       },
       "peerDependencies": {
         "@apache-arrow/ts": "^14.0.2",
diff --git a/docs/src/guides/sql_querying.md b/docs/src/guides/sql_querying.md
index 27cfa79a..30ca2ffe 100644
--- a/docs/src/guides/sql_querying.md
+++ b/docs/src/guides/sql_querying.md
@@ -1,7 +1,9 @@
+# SQL Querying
+
 You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
 This guide will show how to query Lance tables them using both.
 
-We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
+We will re-use the dataset [created previously](./tables.md):
 
 ```python
 import lancedb
@@ -27,15 +29,10 @@ arrow_table = table.to_lance()
 duckdb.query("SELECT * FROM arrow_table")
 ```
 
-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │   10.0 │
-│ [5.9, 26.5] │ bar     │   20.0 │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |
 
 ## Querying a LanceDB Table with Apache Datafusion
 
@@ -57,12 +54,7 @@ Register the table created with the Datafusion session context.
     --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
     ```
 
-```
-┌─────────────┬─────────┬────────┐
-│   vector    │  item   │ price  │
-│   float[]   │ varchar │ double │
-├─────────────┼─────────┼────────┤
-│ [3.1, 4.1]  │ foo     │   10.0 │
-│ [5.9, 26.5] │ bar     │   20.0 │
-└─────────────┴─────────┴────────┘
-```
+| vector      | item | price |
+| ----------- | ---- | ----- |
+| [3.1, 4.1]  | foo  | 10.0  |
+| [5.9, 26.5] | bar  | 20.0  |
diff --git a/docs/src/js/classes/MatchQuery.md b/docs/src/js/classes/MatchQuery.md
index e69e47ca..61fd434a 100644
--- a/docs/src/js/classes/MatchQuery.md
+++ b/docs/src/js/classes/MatchQuery.md
@@ -41,6 +41,7 @@ Creates an instance of MatchQuery.
     - `fuzziness`: The fuzziness level for the query (default is 0).
     - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
     - `operator`: The logical operator to use for combining terms in the query (default is "OR").
+    - `prefixLength`: The number of leading characters that must remain unchanged during fuzzy matching.
 
 * **options.boost?**: `number`
 
@@ -50,6 +51,8 @@ Creates an instance of MatchQuery.
 
 * **options.operator?**: [`Operator`](../enumerations/Operator.md)
 
+* **options.prefixLength?**: `number`
+
 #### Returns
 
 [`MatchQuery`](MatchQuery.md)
diff --git a/docs/src/js/classes/Table.md b/docs/src/js/classes/Table.md
index 0bad38a5..23fd8b38 100644
--- a/docs/src/js/classes/Table.md
+++ b/docs/src/js/classes/Table.md
@@ -612,7 +612,7 @@ of the given query
 
 #### Parameters
 
-* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
+* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
     the query, a vector or string
 
 * **queryType?**: `string`
@@ -799,7 +799,7 @@ by `query`.
 
 #### Parameters
 
-* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
+* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
 
 #### Returns
 
diff --git a/docs/src/js/classes/VectorQuery.md b/docs/src/js/classes/VectorQuery.md
index 568ded42..66339774 100644
--- a/docs/src/js/classes/VectorQuery.md
+++ b/docs/src/js/classes/VectorQuery.md
@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
 
 ***
 
+### maximumNprobes()
+
+```ts
+maximumNprobes(maximumNprobes): VectorQuery
+```
+
+Set the maximum number of probes used.
+
+This controls the maximum number of partitions that will be searched.  If this
+number is greater than minimumNprobes then the excess partitions will _only_ be
+searched if we have not found enough results.  This can be useful when there is
+a narrow filter to allow these queries to spend more time searching and avoid
+potential false negatives.
+
+#### Parameters
+
+* **maximumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
+### minimumNprobes()
+
+```ts
+minimumNprobes(minimumNprobes): VectorQuery
+```
+
+Set the minimum number of probes used.
+
+This controls the minimum number of partitions that will be searched.  This
+parameter will impact every query against a vector index, regardless of the
+filter.  See `nprobes` for more details.  Higher values will increase recall
+but will also increase latency.
+
+#### Parameters
+
+* **minimumNprobes**: `number`
+
+#### Returns
+
+[`VectorQuery`](VectorQuery.md)
+
+***
+
 ### nprobes()
 
 ```ts
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
 your actual data to find the smallest possible value that will still give
 you the desired recall.
 
+For more fine grained control over behavior when you have a very narrow filter
+you can use `minimumNprobes` and `maximumNprobes`.  This method sets both
+the minimum and maximum to the same value.
+
 #### Parameters
 
 * **nprobes**: `number`
diff --git a/docs/src/js/enumerations/Occur.md b/docs/src/js/enumerations/Occur.md
index 5e84958b..506727a1 100644
--- a/docs/src/js/enumerations/Occur.md
+++ b/docs/src/js/enumerations/Occur.md
@@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries.
 
 - `Must`: The term must be present in the document.
 - `Should`: The term should contribute to the document score, but is not required.
+- `MustNot`: The term must not be present in the document.
 
 ## Enumeration Members
 
@@ -21,6 +22,14 @@ Must: "MUST";
 
 ***
 
+### MustNot
+
+```ts
+MustNot: "MUST_NOT";
+```
+
+***
+
 ### Should
 
 ```ts
diff --git a/docs/src/js/globals.md b/docs/src/js/globals.md
index caf73804..e7e6beac 100644
--- a/docs/src/js/globals.md
+++ b/docs/src/js/globals.md
@@ -84,6 +84,7 @@
 - [FieldLike](type-aliases/FieldLike.md)
 - [IntoSql](type-aliases/IntoSql.md)
 - [IntoVector](type-aliases/IntoVector.md)
+- [MultiVector](type-aliases/MultiVector.md)
 - [RecordBatchLike](type-aliases/RecordBatchLike.md)
 - [SchemaLike](type-aliases/SchemaLike.md)
 - [TableLike](type-aliases/TableLike.md)
diff --git a/docs/src/js/interfaces/FtsOptions.md b/docs/src/js/interfaces/FtsOptions.md
index af774cb1..0e982216 100644
--- a/docs/src/js/interfaces/FtsOptions.md
+++ b/docs/src/js/interfaces/FtsOptions.md
@@ -23,7 +23,7 @@ whether to remove punctuation
 ### baseTokenizer?
 
 ```ts
-optional baseTokenizer: "raw" | "simple" | "whitespace";
+optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
 ```
 
 The tokenizer to use when building the index.
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
 
 ***
 
+### ngramMaxLength?
+
+```ts
+optional ngramMaxLength: number;
+```
+
+ngram max length
+
+***
+
+### ngramMinLength?
+
+```ts
+optional ngramMinLength: number;
+```
+
+ngram min length
+
+***
+
+### prefixOnly?
+
+```ts
+optional prefixOnly: boolean;
+```
+
+whether to only index the prefix of the token for ngram tokenizer
+
+***
+
 ### removeStopWords?
 
 ```ts
diff --git a/docs/src/js/interfaces/OptimizeOptions.md b/docs/src/js/interfaces/OptimizeOptions.md
index 651835a0..e2897970 100644
--- a/docs/src/js/interfaces/OptimizeOptions.md
+++ b/docs/src/js/interfaces/OptimizeOptions.md
@@ -24,10 +24,10 @@ The default is 7 days
 // Delete all versions older than 1 day
 const olderThan = new Date();
 olderThan.setDate(olderThan.getDate() - 1));
-tbl.cleanupOlderVersions(olderThan);
+tbl.optimize({cleanupOlderThan: olderThan});
 
 // Delete all versions except the current version
-tbl.cleanupOlderVersions(new Date());
+tbl.optimize({cleanupOlderThan: new Date()});
 ```
 
 ***
diff --git a/docs/src/js/type-aliases/MultiVector.md b/docs/src/js/type-aliases/MultiVector.md
new file mode 100644
index 00000000..760f4f8b
--- /dev/null
+++ b/docs/src/js/type-aliases/MultiVector.md
@@ -0,0 +1,11 @@
+[**@lancedb/lancedb**](../README.md) • **Docs**
+
+***
+
+[@lancedb/lancedb](../globals.md) / MultiVector
+
+# Type Alias: MultiVector
+
+```ts
+type MultiVector: IntoVector[];
+```
diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py
index 8db130c1..0bd38076 100755
--- a/docs/test/md_testing.py
+++ b/docs/test/md_testing.py
@@ -30,7 +30,8 @@ excluded_globs = [
     "../src/rag/advanced_techniques/*.md",
     "../src/guides/scalar_index.md",
     "../src/guides/storage.md",
-    "../src/search.md"
+    "../src/search.md",
+    "../src/guides/sql_querying.md",
 ]
 
 python_prefix = "py"
diff --git a/node/package-lock.json b/node/package-lock.json
index 6da63ac7..36cde48b 100644
--- a/node/package-lock.json
+++ b/node/package-lock.json
@@ -326,6 +326,71 @@
         "@jridgewell/sourcemap-codec": "^1.4.10"
       }
     },
+    "node_modules/@lancedb/vectordb-darwin-arm64": {
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz",
+      "integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-darwin-x64": {
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz",
+      "integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz",
+      "integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-linux-x64-gnu": {
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz",
+      "integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
+    },
+    "node_modules/@lancedb/vectordb-win32-x64-msvc": {
+      "version": "0.21.2-beta.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz",
+      "integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
+    },
     "node_modules/@neon-rs/cli": {
       "version": "0.0.160",
       "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts
index 9d49f243..d26a04f4 100644
--- a/nodejs/__test__/table.test.ts
+++ b/nodejs/__test__/table.test.ts
@@ -1863,4 +1863,43 @@ describe("column name options", () => {
     expect(results[0].query_index).toBe(0);
     expect(results[1].query_index).toBe(1);
   });
+
+  test("index and search multivectors", async () => {
+    const db = await connect(tmpDir.name);
+    const data = [];
+    // generate 256 random multivectors
+    for (let i = 0; i < 256; i++) {
+      data.push({
+        multivector: Array.from({ length: 10 }, () =>
+          Array(2).fill(Math.random()),
+        ),
+      });
+    }
+    const table = await db.createTable("multivectors", data, {
+      schema: new Schema([
+        new Field(
+          "multivector",
+          new List(
+            new Field(
+              "item",
+              new FixedSizeList(2, new Field("item", new Float32())),
+            ),
+          ),
+        ),
+      ]),
+    });
+
+    const results = await table.search(data[0].multivector).limit(10).toArray();
+    expect(results.length).toBe(10);
+
+    await table.createIndex("multivector", {
+      config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
+    });
+
+    const results2 = await table
+      .search(data[0].multivector)
+      .limit(10)
+      .toArray();
+    expect(results2.length).toBe(10);
+  });
 });
diff --git a/nodejs/lancedb/arrow.ts b/nodejs/lancedb/arrow.ts
index 852e14df..7bf2bb40 100644
--- a/nodejs/lancedb/arrow.ts
+++ b/nodejs/lancedb/arrow.ts
@@ -107,6 +107,20 @@ export type IntoVector =
   | number[]
   | Promise<Float32Array | Float64Array | number[]>;
 
+export type MultiVector = IntoVector[];
+
+export function isMultiVector(value: unknown): value is MultiVector {
+  return Array.isArray(value) && isIntoVector(value[0]); // decided from the first element only
+}
+
+export function isIntoVector(value: unknown): value is IntoVector {
+  return (
+    value instanceof Float32Array ||
+    value instanceof Float64Array ||
+    (Array.isArray(value) && !Array.isArray(value[0])) // flat array (first element is not an array); NOTE(review): the Promise variant of IntoVector is not matched here
+  );
+}
+
 export function isArrowTable(value: object): value is TableLike {
   if (value instanceof ArrowTable) return true;
   return "schema" in value && "batches" in value;
diff --git a/nodejs/lancedb/index.ts b/nodejs/lancedb/index.ts
index 0750a48c..0bf4f9f5 100644
--- a/nodejs/lancedb/index.ts
+++ b/nodejs/lancedb/index.ts
@@ -100,6 +100,7 @@ export {
   RecordBatchLike,
   DataLike,
   IntoVector,
+  MultiVector,
 } from "./arrow";
 export { IntoSql, packBits } from "./util";
 
diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts
index 8c62a927..35dbd4c0 100644
--- a/nodejs/lancedb/table.ts
+++ b/nodejs/lancedb/table.ts
@@ -6,9 +6,11 @@ import {
   Data,
   DataType,
   IntoVector,
+  MultiVector,
   Schema,
   dataTypeToJson,
   fromDataToBuffer,
+  isMultiVector,
   tableFromIPC,
 } from "./arrow";
 
@@ -346,7 +348,7 @@ export abstract class Table {
    * if the query is a string and no embedding function is defined, it will be treated as a full text search query
    */
   abstract search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
     queryType?: string,
     ftsColumns?: string | string[],
   ): VectorQuery | Query;
@@ -357,7 +359,7 @@ export abstract class Table {
    * is the same thing as calling `nearestTo` on the builder returned
    * by `query`.  @see {@link Query#nearestTo} for more details.
    */
-  abstract vectorSearch(vector: IntoVector): VectorQuery;
+  abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
   /**
    * Add new columns with defined values.
    * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
   }
 
   search(
-    query: string | IntoVector | FullTextQuery,
+    query: string | IntoVector | MultiVector | FullTextQuery,
     queryType: string = "auto",
     ftsColumns?: string | string[],
   ): VectorQuery | Query {
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
     return this.query().nearestTo(queryPromise);
   }
 
-  vectorSearch(vector: IntoVector): VectorQuery {
+  vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
+    if (isMultiVector(vector)) {
+      const query = this.query().nearestTo(vector[0]); // start the query from the first vector
+      for (const v of vector.slice(1)) {
+        query.addQueryVector(v); // add every remaining vector to the same query
+      }
+      return query;
+    }
+
     return this.query().nearestTo(vector);
   }