From 96c66fd0878165bcef39dc5f98d2a70a53030897 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 22 Jul 2025 21:19:34 +0800 Subject: [PATCH] feat: support multivector for JS SDK (#2527) Signed-off-by: BubbleCal --- docs/package-lock.json | 12 ++--- docs/src/guides/sql_querying.md | 30 ++++------- docs/src/js/classes/MatchQuery.md | 3 ++ docs/src/js/classes/Table.md | 4 +- docs/src/js/classes/VectorQuery.md | 51 ++++++++++++++++++ docs/src/js/enumerations/Occur.md | 9 ++++ docs/src/js/globals.md | 1 + docs/src/js/interfaces/FtsOptions.md | 32 ++++++++++- docs/src/js/interfaces/OptimizeOptions.md | 4 +- docs/src/js/type-aliases/MultiVector.md | 11 ++++ docs/test/md_testing.py | 3 +- node/package-lock.json | 65 +++++++++++++++++++++++ nodejs/__test__/table.test.ts | 39 ++++++++++++++ nodejs/lancedb/arrow.ts | 14 +++++ nodejs/lancedb/index.ts | 1 + nodejs/lancedb/table.ts | 18 +++++-- 16 files changed, 262 insertions(+), 35 deletions(-) create mode 100644 docs/src/js/type-aliases/MultiVector.md diff --git a/docs/package-lock.json b/docs/package-lock.json index 1baad851..e87f3e0e 100644 --- a/docs/package-lock.json +++ b/docs/package-lock.json @@ -19,7 +19,7 @@ }, "../node": { "name": "vectordb", - "version": "0.12.0", + "version": "0.21.2-beta.0", "cpu": [ "x64", "arm64" @@ -65,11 +65,11 @@ "uuid": "^9.0.0" }, "optionalDependencies": { - "@lancedb/vectordb-darwin-arm64": "0.12.0", - "@lancedb/vectordb-darwin-x64": "0.12.0", - "@lancedb/vectordb-linux-arm64-gnu": "0.12.0", - "@lancedb/vectordb-linux-x64-gnu": "0.12.0", - "@lancedb/vectordb-win32-x64-msvc": "0.12.0" + "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0", + "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0", + "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0", + "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0", + "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0" }, "peerDependencies": { "@apache-arrow/ts": "^14.0.2", diff --git a/docs/src/guides/sql_querying.md b/docs/src/guides/sql_querying.md index 
27cfa79a..30ca2ffe 100644 --- a/docs/src/guides/sql_querying.md +++ b/docs/src/guides/sql_querying.md @@ -1,7 +1,9 @@ +# SQL Querying + You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL. This guide will show how to query Lance tables them using both. -We will re-use the dataset [created previously](./pandas_and_pyarrow.md): +We will re-use the dataset [created previously](./tables.md): ```python import lancedb @@ -27,15 +29,10 @@ arrow_table = table.to_lance() duckdb.query("SELECT * FROM arrow_table") ``` -``` -┌─────────────┬─────────┬────────┐ -│ vector │ item │ price │ -│ float[] │ varchar │ double │ -├─────────────┼─────────┼────────┤ -│ [3.1, 4.1] │ foo │ 10.0 │ -│ [5.9, 26.5] │ bar │ 20.0 │ -└─────────────┴─────────┴────────┘ -``` +| vector | item | price | +| ----------- | ---- | ----- | +| [3.1, 4.1] | foo | 10.0 | +| [5.9, 26.5] | bar | 20.0 | ## Querying a LanceDB Table with Apache Datafusion @@ -57,12 +54,7 @@ Register the table created with the Datafusion session context. --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic" ``` -``` -┌─────────────┬─────────┬────────┐ -│ vector │ item │ price │ -│ float[] │ varchar │ double │ -├─────────────┼─────────┼────────┤ -│ [3.1, 4.1] │ foo │ 10.0 │ -│ [5.9, 26.5] │ bar │ 20.0 │ -└─────────────┴─────────┴────────┘ -``` +| vector | item | price | +| ----------- | ---- | ----- | +| [3.1, 4.1] | foo | 10.0 | +| [5.9, 26.5] | bar | 20.0 | diff --git a/docs/src/js/classes/MatchQuery.md b/docs/src/js/classes/MatchQuery.md index e69e47ca..61fd434a 100644 --- a/docs/src/js/classes/MatchQuery.md +++ b/docs/src/js/classes/MatchQuery.md @@ -41,6 +41,7 @@ Creates an instance of MatchQuery. - `fuzziness`: The fuzziness level for the query (default is 0). - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50). - `operator`: The logical operator to use for combining terms in the query (default is "OR"). 
+ - `prefixLength`: The number of beginning characters left unchanged for fuzzy matching. * **options.boost?**: `number` @@ -50,6 +51,8 @@ Creates an instance of MatchQuery. * **options.operator?**: [`Operator`](../enumerations/Operator.md) +* **options.prefixLength?**: `number` + #### Returns [`MatchQuery`](MatchQuery.md) diff --git a/docs/src/js/classes/Table.md b/docs/src/js/classes/Table.md index 0bad38a5..23fd8b38 100644 --- a/docs/src/js/classes/Table.md +++ b/docs/src/js/classes/Table.md @@ -612,7 +612,7 @@ of the given query #### Parameters -* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md) +* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md) the query, a vector or string * **queryType?**: `string` @@ -799,7 +799,7 @@ by `query`. #### Parameters -* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) +* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) #### Returns diff --git a/docs/src/js/classes/VectorQuery.md b/docs/src/js/classes/VectorQuery.md index 568ded42..66339774 100644 --- a/docs/src/js/classes/VectorQuery.md +++ b/docs/src/js/classes/VectorQuery.md @@ -386,6 +386,53 @@ called then every valid row from the table will be returned. *** +### maximumNprobes() + +```ts +maximumNprobes(maximumNprobes): VectorQuery +``` + +Set the maximum number of probes used. + +This controls the maximum number of partitions that will be searched. If this +number is greater than minimumNprobes then the excess partitions will _only_ be +searched if we have not found enough results. This can be useful when there is +a narrow filter to allow these queries to spend more time searching and avoid +potential false negatives. 
+ +#### Parameters + +* **maximumNprobes**: `number` + +#### Returns + +[`VectorQuery`](VectorQuery.md) + +*** + +### minimumNprobes() + +```ts +minimumNprobes(minimumNprobes): VectorQuery +``` + +Set the minimum number of probes used. + +This controls the minimum number of partitions that will be searched. This +parameter will impact every query against a vector index, regardless of the +filter. See `nprobes` for more details. Higher values will increase recall +but will also increase latency. + +#### Parameters + +* **minimumNprobes**: `number` + +#### Returns + +[`VectorQuery`](VectorQuery.md) + +*** + ### nprobes() ```ts @@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against your actual data to find the smallest possible value that will still give you the desired recall. +For more fine grained control over behavior when you have a very narrow filter +you can use `minimumNprobes` and `maximumNprobes`. This method sets both +the minimum and maximum to the same value. + #### Parameters * **nprobes**: `number` diff --git a/docs/src/js/enumerations/Occur.md b/docs/src/js/enumerations/Occur.md index 5e84958b..506727a1 100644 --- a/docs/src/js/enumerations/Occur.md +++ b/docs/src/js/enumerations/Occur.md @@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries. - `Must`: The term must be present in the document. - `Should`: The term should contribute to the document score, but is not required. +- `MustNot`: The term must not be present in the document. 
## Enumeration Members @@ -21,6 +22,14 @@ Must: "MUST"; *** +### MustNot + +```ts +MustNot: "MUST_NOT"; +``` + +*** + ### Should ```ts diff --git a/docs/src/js/globals.md b/docs/src/js/globals.md index caf73804..e7e6beac 100644 --- a/docs/src/js/globals.md +++ b/docs/src/js/globals.md @@ -84,6 +84,7 @@ - [FieldLike](type-aliases/FieldLike.md) - [IntoSql](type-aliases/IntoSql.md) - [IntoVector](type-aliases/IntoVector.md) +- [MultiVector](type-aliases/MultiVector.md) - [RecordBatchLike](type-aliases/RecordBatchLike.md) - [SchemaLike](type-aliases/SchemaLike.md) - [TableLike](type-aliases/TableLike.md) diff --git a/docs/src/js/interfaces/FtsOptions.md b/docs/src/js/interfaces/FtsOptions.md index af774cb1..0e982216 100644 --- a/docs/src/js/interfaces/FtsOptions.md +++ b/docs/src/js/interfaces/FtsOptions.md @@ -23,7 +23,7 @@ whether to remove punctuation ### baseTokenizer? ```ts -optional baseTokenizer: "raw" | "simple" | "whitespace"; +optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram"; ``` The tokenizer to use when building the index. @@ -71,6 +71,36 @@ tokens longer than this length will be ignored *** +### ngramMaxLength? + +```ts +optional ngramMaxLength: number; +``` + +ngram max length + +*** + +### ngramMinLength? + +```ts +optional ngramMinLength: number; +``` + +ngram min length + +*** + +### prefixOnly? + +```ts +optional prefixOnly: boolean; +``` + +whether to only index the prefix of the token for ngram tokenizer + +*** + ### removeStopWords? 
```ts diff --git a/docs/src/js/interfaces/OptimizeOptions.md b/docs/src/js/interfaces/OptimizeOptions.md index 651835a0..e2897970 100644 --- a/docs/src/js/interfaces/OptimizeOptions.md +++ b/docs/src/js/interfaces/OptimizeOptions.md @@ -24,10 +24,10 @@ The default is 7 days // Delete all versions older than 1 day const olderThan = new Date(); olderThan.setDate(olderThan.getDate() - 1)); -tbl.cleanupOlderVersions(olderThan); +tbl.optimize({cleanupOlderThan: olderThan}); // Delete all versions except the current version -tbl.cleanupOlderVersions(new Date()); +tbl.optimize({cleanupOlderThan: new Date()}); ``` *** diff --git a/docs/src/js/type-aliases/MultiVector.md b/docs/src/js/type-aliases/MultiVector.md new file mode 100644 index 00000000..760f4f8b --- /dev/null +++ b/docs/src/js/type-aliases/MultiVector.md @@ -0,0 +1,11 @@ +[**@lancedb/lancedb**](../README.md) • **Docs** + +*** + +[@lancedb/lancedb](../globals.md) / MultiVector + +# Type Alias: MultiVector + +```ts +type MultiVector: IntoVector[]; +``` diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py index 8db130c1..0bd38076 100755 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -30,7 +30,8 @@ excluded_globs = [ "../src/rag/advanced_techniques/*.md", "../src/guides/scalar_index.md", "../src/guides/storage.md", - "../src/search.md" + "../src/search.md", + "../src/guides/sql_querying.md", ] python_prefix = "py" diff --git a/node/package-lock.json b/node/package-lock.json index 6da63ac7..36cde48b 100644 --- a/node/package-lock.json +++ b/node/package-lock.json @@ -326,6 +326,71 @@ "@jridgewell/sourcemap-codec": "^1.4.10" } }, + "node_modules/@lancedb/vectordb-darwin-arm64": { + "version": "0.21.2-beta.0", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz", + "integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + 
"optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@lancedb/vectordb-darwin-x64": { + "version": "0.21.2-beta.0", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz", + "integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@lancedb/vectordb-linux-arm64-gnu": { + "version": "0.21.2-beta.0", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz", + "integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@lancedb/vectordb-linux-x64-gnu": { + "version": "0.21.2-beta.0", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz", + "integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@lancedb/vectordb-win32-x64-msvc": { + "version": "0.21.2-beta.0", + "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz", + "integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@neon-rs/cli": { "version": "0.0.160", "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz", diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 9d49f243..d26a04f4 100644 --- a/nodejs/__test__/table.test.ts +++ 
b/nodejs/__test__/table.test.ts @@ -1863,4 +1863,43 @@ describe("column name options", () => { expect(results[0].query_index).toBe(0); expect(results[1].query_index).toBe(1); }); + + test("index and search multivectors", async () => { + const db = await connect(tmpDir.name); + const data = []; + // generate 256 random multivectors + for (let i = 0; i < 256; i++) { + data.push({ + multivector: Array.from({ length: 10 }, () => + Array(2).fill(Math.random()), + ), + }); + } + const table = await db.createTable("multivectors", data, { + schema: new Schema([ + new Field( + "multivector", + new List( + new Field( + "item", + new FixedSizeList(2, new Field("item", new Float32())), + ), + ), + ), + ]), + }); + + const results = await table.search(data[0].multivector).limit(10).toArray(); + expect(results.length).toBe(10); + + await table.createIndex("multivector", { + config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }), + }); + + const results2 = await table + .search(data[0].multivector) + .limit(10) + .toArray(); + expect(results2.length).toBe(10); + }); }); diff --git a/nodejs/lancedb/arrow.ts b/nodejs/lancedb/arrow.ts index 852e14df..7bf2bb40 100644 --- a/nodejs/lancedb/arrow.ts +++ b/nodejs/lancedb/arrow.ts @@ -107,6 +107,20 @@ export type IntoVector = | number[] | Promise; +export type MultiVector = IntoVector[]; + +export function isMultiVector(value: unknown): value is MultiVector { + return Array.isArray(value) && isIntoVector(value[0]); +} + +export function isIntoVector(value: unknown): value is IntoVector { + return ( + value instanceof Float32Array || + value instanceof Float64Array || + (Array.isArray(value) && !Array.isArray(value[0])) + ); +} + export function isArrowTable(value: object): value is TableLike { if (value instanceof ArrowTable) return true; return "schema" in value && "batches" in value; diff --git a/nodejs/lancedb/index.ts b/nodejs/lancedb/index.ts index 0750a48c..0bf4f9f5 100644 --- a/nodejs/lancedb/index.ts +++ 
b/nodejs/lancedb/index.ts @@ -100,6 +100,7 @@ export { RecordBatchLike, DataLike, IntoVector, + MultiVector, } from "./arrow"; export { IntoSql, packBits } from "./util"; diff --git a/nodejs/lancedb/table.ts b/nodejs/lancedb/table.ts index 8c62a927..35dbd4c0 100644 --- a/nodejs/lancedb/table.ts +++ b/nodejs/lancedb/table.ts @@ -6,9 +6,11 @@ import { Data, DataType, IntoVector, + MultiVector, Schema, dataTypeToJson, fromDataToBuffer, + isMultiVector, tableFromIPC, } from "./arrow"; @@ -346,7 +348,7 @@ export abstract class Table { * if the query is a string and no embedding function is defined, it will be treated as a full text search query */ abstract search( - query: string | IntoVector | FullTextQuery, + query: string | IntoVector | MultiVector | FullTextQuery, queryType?: string, ftsColumns?: string | string[], ): VectorQuery | Query; @@ -357,7 +359,7 @@ export abstract class Table { * is the same thing as calling `nearestTo` on the builder returned * by `query`. @see {@link Query#nearestTo} for more details. */ - abstract vectorSearch(vector: IntoVector): VectorQuery; + abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery; /** * Add new columns with defined values. * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and @@ -668,7 +670,7 @@ export class LocalTable extends Table { } search( - query: string | IntoVector | FullTextQuery, + query: string | IntoVector | MultiVector | FullTextQuery, queryType: string = "auto", ftsColumns?: string | string[], ): VectorQuery | Query { @@ -715,7 +717,15 @@ export class LocalTable extends Table { return this.query().nearestTo(queryPromise); } - vectorSearch(vector: IntoVector): VectorQuery { + vectorSearch(vector: IntoVector | MultiVector): VectorQuery { + if (isMultiVector(vector)) { + const query = this.query().nearestTo(vector[0]); + for (const v of vector.slice(1)) { + query.addQueryVector(v); + } + return query; + } + return this.query().nearestTo(vector); }