feat: support multivector for JS SDK (#2527)

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
BubbleCal
2025-07-22 21:19:34 +08:00
committed by GitHub
parent 0579303602
commit 96c66fd087
16 changed files with 262 additions and 35 deletions

12
docs/package-lock.json generated
View File

@@ -19,7 +19,7 @@
}, },
"../node": { "../node": {
"name": "vectordb", "name": "vectordb",
"version": "0.12.0", "version": "0.21.2-beta.0",
"cpu": [ "cpu": [
"x64", "x64",
"arm64" "arm64"
@@ -65,11 +65,11 @@
"uuid": "^9.0.0" "uuid": "^9.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.12.0", "@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
"@lancedb/vectordb-darwin-x64": "0.12.0", "@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0", "@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.12.0", "@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.12.0" "@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
}, },
"peerDependencies": { "peerDependencies": {
"@apache-arrow/ts": "^14.0.2", "@apache-arrow/ts": "^14.0.2",

View File

@@ -1,7 +1,9 @@
# SQL Querying
You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL. You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
This guide will show how to query Lance tables using both. This guide will show how to query Lance tables using both.
We will re-use the dataset [created previously](./pandas_and_pyarrow.md): We will re-use the dataset [created previously](./tables.md):
```python ```python
import lancedb import lancedb
@@ -27,15 +29,10 @@ arrow_table = table.to_lance()
duckdb.query("SELECT * FROM arrow_table") duckdb.query("SELECT * FROM arrow_table")
``` ```
``` | vector | item | price |
┌─────────────┬─────────┬────────┐ | ----------- | ---- | ----- |
│ vector │ item │ price | [3.1, 4.1] | foo | 10.0 |
│ float[] │ varchar │ double │ | [5.9, 26.5] | bar | 20.0 |
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
```
## Querying a LanceDB Table with Apache Datafusion ## Querying a LanceDB Table with Apache Datafusion
@@ -57,12 +54,7 @@ Register the table created with the Datafusion session context.
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic" --8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
``` ```
``` | vector | item | price |
┌─────────────┬─────────┬────────┐ | ----------- | ---- | ----- |
│ vector │ item │ price | [3.1, 4.1] | foo | 10.0 |
│ float[] │ varchar │ double │ | [5.9, 26.5] | bar | 20.0 |
├─────────────┼─────────┼────────┤
│ [3.1, 4.1] │ foo │ 10.0 │
│ [5.9, 26.5] │ bar │ 20.0 │
└─────────────┴─────────┴────────┘
```

View File

@@ -41,6 +41,7 @@ Creates an instance of MatchQuery.
- `fuzziness`: The fuzziness level for the query (default is 0). - `fuzziness`: The fuzziness level for the query (default is 0).
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50). - `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
- `operator`: The logical operator to use for combining terms in the query (default is "OR"). - `operator`: The logical operator to use for combining terms in the query (default is "OR").
- `prefixLength`: The number of leading characters that are left unchanged during fuzzy matching.
* **options.boost?**: `number` * **options.boost?**: `number`
@@ -50,6 +51,8 @@ Creates an instance of MatchQuery.
* **options.operator?**: [`Operator`](../enumerations/Operator.md) * **options.operator?**: [`Operator`](../enumerations/Operator.md)
* **options.prefixLength?**: `number`
#### Returns #### Returns
[`MatchQuery`](MatchQuery.md) [`MatchQuery`](MatchQuery.md)

View File

@@ -612,7 +612,7 @@ of the given query
#### Parameters #### Parameters
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md) * **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
the query, a vector or string the query, a vector or string
* **queryType?**: `string` * **queryType?**: `string`
@@ -799,7 +799,7 @@ by `query`.
#### Parameters #### Parameters
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) * **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
#### Returns #### Returns

View File

@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
*** ***
### maximumNprobes()
```ts
maximumNprobes(maximumNprobes): VectorQuery
```
Set the maximum number of probes used.
This controls the maximum number of partitions that will be searched. If this
number is greater than minimumNprobes then the excess partitions will _only_ be
searched if we have not found enough results. This can be useful when there is
a narrow filter to allow these queries to spend more time searching and avoid
potential false negatives.
#### Parameters
* **maximumNprobes**: `number`
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### minimumNprobes()
```ts
minimumNprobes(minimumNprobes): VectorQuery
```
Set the minimum number of probes used.
This controls the minimum number of partitions that will be searched. This
parameter will impact every query against a vector index, regardless of the
filter. See `nprobes` for more details. Higher values will increase recall
but will also increase latency.
#### Parameters
* **minimumNprobes**: `number`
#### Returns
[`VectorQuery`](VectorQuery.md)
***
### nprobes() ### nprobes()
```ts ```ts
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
your actual data to find the smallest possible value that will still give your actual data to find the smallest possible value that will still give
you the desired recall. you the desired recall.
For more fine grained control over behavior when you have a very narrow filter
you can use `minimumNprobes` and `maximumNprobes`. This method sets both
the minimum and maximum to the same value.
#### Parameters #### Parameters
* **nprobes**: `number` * **nprobes**: `number`

View File

@@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries.
- `Must`: The term must be present in the document. - `Must`: The term must be present in the document.
- `Should`: The term should contribute to the document score, but is not required. - `Should`: The term should contribute to the document score, but is not required.
- `MustNot`: The term must not be present in the document.
## Enumeration Members ## Enumeration Members
@@ -21,6 +22,14 @@ Must: "MUST";
*** ***
### MustNot
```ts
MustNot: "MUST_NOT";
```
***
### Should ### Should
```ts ```ts

View File

@@ -84,6 +84,7 @@
- [FieldLike](type-aliases/FieldLike.md) - [FieldLike](type-aliases/FieldLike.md)
- [IntoSql](type-aliases/IntoSql.md) - [IntoSql](type-aliases/IntoSql.md)
- [IntoVector](type-aliases/IntoVector.md) - [IntoVector](type-aliases/IntoVector.md)
- [MultiVector](type-aliases/MultiVector.md)
- [RecordBatchLike](type-aliases/RecordBatchLike.md) - [RecordBatchLike](type-aliases/RecordBatchLike.md)
- [SchemaLike](type-aliases/SchemaLike.md) - [SchemaLike](type-aliases/SchemaLike.md)
- [TableLike](type-aliases/TableLike.md) - [TableLike](type-aliases/TableLike.md)

View File

@@ -23,7 +23,7 @@ whether to remove punctuation
### baseTokenizer? ### baseTokenizer?
```ts ```ts
optional baseTokenizer: "raw" | "simple" | "whitespace"; optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
``` ```
The tokenizer to use when building the index. The tokenizer to use when building the index.
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
*** ***
### ngramMaxLength?
```ts
optional ngramMaxLength: number;
```
ngram max length
***
### ngramMinLength?
```ts
optional ngramMinLength: number;
```
ngram min length
***
### prefixOnly?
```ts
optional prefixOnly: boolean;
```
whether to only index the prefix of the token for ngram tokenizer
***
### removeStopWords? ### removeStopWords?
```ts ```ts

View File

@@ -24,10 +24,10 @@ The default is 7 days
// Delete all versions older than 1 day // Delete all versions older than 1 day
const olderThan = new Date(); const olderThan = new Date();
olderThan.setDate(olderThan.getDate() - 1); olderThan.setDate(olderThan.getDate() - 1);
tbl.cleanupOlderVersions(olderThan); tbl.optimize({cleanupOlderThan: olderThan});
// Delete all versions except the current version // Delete all versions except the current version
tbl.cleanupOlderVersions(new Date()); tbl.optimize({cleanupOlderThan: new Date()});
``` ```
*** ***

View File

@@ -0,0 +1,11 @@
[**@lancedb/lancedb**](../README.md) • **Docs**
***
[@lancedb/lancedb](../globals.md) / MultiVector
# Type Alias: MultiVector
```ts
type MultiVector: IntoVector[];
```

View File

@@ -30,7 +30,8 @@ excluded_globs = [
"../src/rag/advanced_techniques/*.md", "../src/rag/advanced_techniques/*.md",
"../src/guides/scalar_index.md", "../src/guides/scalar_index.md",
"../src/guides/storage.md", "../src/guides/storage.md",
"../src/search.md" "../src/search.md",
"../src/guides/sql_querying.md",
] ]
python_prefix = "py" python_prefix = "py"

65
node/package-lock.json generated
View File

@@ -326,6 +326,71 @@
"@jridgewell/sourcemap-codec": "^1.4.10" "@jridgewell/sourcemap-codec": "^1.4.10"
} }
}, },
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.21.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz",
"integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.21.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz",
"integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.21.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz",
"integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.21.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz",
"integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.21.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz",
"integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": { "node_modules/@neon-rs/cli": {
"version": "0.0.160", "version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz", "resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",

View File

@@ -1863,4 +1863,43 @@ describe("column name options", () => {
expect(results[0].query_index).toBe(0); expect(results[0].query_index).toBe(0);
expect(results[1].query_index).toBe(1); expect(results[1].query_index).toBe(1);
}); });
test("index and search multivectors", async () => {
  const db = await connect(tmpDir.name);
  const data = [];
  // Generate 256 rows; each row holds one multivector made of 10 random
  // 2-dimensional vectors.
  for (let i = 0; i < 256; i++) {
    data.push({
      multivector: Array.from({ length: 10 }, () =>
        // Draw every component separately. `Array(2).fill(Math.random())`
        // evaluates `Math.random()` once and repeats that value, which makes
        // all vectors collinear (degenerate under the cosine metric below).
        Array.from({ length: 2 }, () => Math.random()),
      ),
    });
  }

  // The column is a variable-length list of fixed-size (2) float32 lists.
  const table = await db.createTable("multivectors", data, {
    schema: new Schema([
      new Field(
        "multivector",
        new List(
          new Field(
            "item",
            new FixedSizeList(2, new Field("item", new Float32())),
          ),
        ),
      ),
    ]),
  });

  // Search with a multivector query before the index is created.
  const results = await table.search(data[0].multivector).limit(10).toArray();
  expect(results.length).toBe(10);

  await table.createIndex("multivector", {
    config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
  });

  // Repeat the same search after the IVF-PQ index has been created.
  const results2 = await table
    .search(data[0].multivector)
    .limit(10)
    .toArray();
  expect(results2.length).toBe(10);
});
}); });

View File

@@ -107,6 +107,20 @@ export type IntoVector =
| number[] | number[]
| Promise<Float32Array | Float64Array | number[]>; | Promise<Float32Array | Float64Array | number[]>;
export type MultiVector = IntoVector[];
/**
 * Type guard for {@link MultiVector}.
 *
 * Only the first element is inspected: `value` must be an array whose element
 * at index 0 passes {@link isIntoVector}. The remaining elements are not
 * checked.
 *
 * @param value - the candidate value
 * @returns `true` when `value` is an array and its first element is accepted
 *   by `isIntoVector`
 */
export function isMultiVector(value: unknown): value is MultiVector {
  if (!Array.isArray(value)) {
    return false;
  }
  const firstEntry: unknown = value[0];
  return isIntoVector(firstEntry);
}
/**
 * Type guard for a single (non-nested) query vector.
 *
 * Accepts a `Float32Array`, a `Float64Array`, or a plain array whose first
 * element is not itself an array (an empty array qualifies). Anything else,
 * including a `Promise`, is rejected by this runtime check.
 *
 * @param value - the candidate value
 * @returns `true` when `value` has one of the accepted shapes
 */
export function isIntoVector(value: unknown): value is IntoVector {
  if (value instanceof Float32Array || value instanceof Float64Array) {
    return true;
  }
  if (!Array.isArray(value)) {
    return false;
  }
  // A nested array means the value is a list of vectors, not a single vector.
  const nested = Array.isArray(value[0]);
  return !nested;
}
export function isArrowTable(value: object): value is TableLike { export function isArrowTable(value: object): value is TableLike {
if (value instanceof ArrowTable) return true; if (value instanceof ArrowTable) return true;
return "schema" in value && "batches" in value; return "schema" in value && "batches" in value;

View File

@@ -100,6 +100,7 @@ export {
RecordBatchLike, RecordBatchLike,
DataLike, DataLike,
IntoVector, IntoVector,
MultiVector,
} from "./arrow"; } from "./arrow";
export { IntoSql, packBits } from "./util"; export { IntoSql, packBits } from "./util";

View File

@@ -6,9 +6,11 @@ import {
Data, Data,
DataType, DataType,
IntoVector, IntoVector,
MultiVector,
Schema, Schema,
dataTypeToJson, dataTypeToJson,
fromDataToBuffer, fromDataToBuffer,
isMultiVector,
tableFromIPC, tableFromIPC,
} from "./arrow"; } from "./arrow";
@@ -346,7 +348,7 @@ export abstract class Table {
* if the query is a string and no embedding function is defined, it will be treated as a full text search query * if the query is a string and no embedding function is defined, it will be treated as a full text search query
*/ */
abstract search( abstract search(
query: string | IntoVector | FullTextQuery, query: string | IntoVector | MultiVector | FullTextQuery,
queryType?: string, queryType?: string,
ftsColumns?: string | string[], ftsColumns?: string | string[],
): VectorQuery | Query; ): VectorQuery | Query;
@@ -357,7 +359,7 @@ export abstract class Table {
* is the same thing as calling `nearestTo` on the builder returned * is the same thing as calling `nearestTo` on the builder returned
* by `query`. @see {@link Query#nearestTo} for more details. * by `query`. @see {@link Query#nearestTo} for more details.
*/ */
abstract vectorSearch(vector: IntoVector): VectorQuery; abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
/** /**
* Add new columns with defined values. * Add new columns with defined values.
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and * @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
} }
search( search(
query: string | IntoVector | FullTextQuery, query: string | IntoVector | MultiVector | FullTextQuery,
queryType: string = "auto", queryType: string = "auto",
ftsColumns?: string | string[], ftsColumns?: string | string[],
): VectorQuery | Query { ): VectorQuery | Query {
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
return this.query().nearestTo(queryPromise); return this.query().nearestTo(queryPromise);
} }
/**
 * Build a vector query for a single vector or a multivector.
 *
 * A single vector is passed straight to `nearestTo`. For a multivector the
 * first vector seeds the query via `nearestTo` and each remaining vector is
 * attached with `addQueryVector`.
 */
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
  if (!isMultiVector(vector)) {
    return this.query().nearestTo(vector);
  }

  const builder = this.query().nearestTo(vector[0]);
  const remaining = vector.slice(1);
  remaining.forEach((extra) => {
    builder.addQueryVector(extra);
  });
  return builder;
}