mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-22 21:09:58 +00:00
feat: support multivector for JS SDK (#2527)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
This commit is contained in:
12
docs/package-lock.json
generated
12
docs/package-lock.json
generated
@@ -19,7 +19,7 @@
|
||||
},
|
||||
"../node": {
|
||||
"name": "vectordb",
|
||||
"version": "0.12.0",
|
||||
"version": "0.21.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -65,11 +65,11 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.12.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.12.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.12.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.12.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.21.2-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.21.2-beta.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
# SQL Querying
|
||||
|
||||
You can use DuckDB and Apache Datafusion to query your LanceDB tables using SQL.
|
||||
This guide will show how to query Lance tables using both.
|
||||
|
||||
We will re-use the dataset [created previously](./pandas_and_pyarrow.md):
|
||||
We will re-use the dataset [created previously](./tables.md):
|
||||
|
||||
```python
|
||||
import lancedb
|
||||
@@ -27,15 +29,10 @@ arrow_table = table.to_lance()
|
||||
duckdb.query("SELECT * FROM arrow_table")
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
## Querying a LanceDB Table with Apache Datafusion
|
||||
|
||||
@@ -57,12 +54,7 @@ Register the table created with the Datafusion session context.
|
||||
--8<-- "python/python/tests/docs/test_guide_tables.py:lance_sql_basic"
|
||||
```
|
||||
|
||||
```
|
||||
┌─────────────┬─────────┬────────┐
|
||||
│ vector │ item │ price │
|
||||
│ float[] │ varchar │ double │
|
||||
├─────────────┼─────────┼────────┤
|
||||
│ [3.1, 4.1] │ foo │ 10.0 │
|
||||
│ [5.9, 26.5] │ bar │ 20.0 │
|
||||
└─────────────┴─────────┴────────┘
|
||||
```
|
||||
| vector | item | price |
|
||||
| ----------- | ---- | ----- |
|
||||
| [3.1, 4.1] | foo | 10.0 |
|
||||
| [5.9, 26.5] | bar | 20.0 |
|
||||
|
||||
@@ -41,6 +41,7 @@ Creates an instance of MatchQuery.
|
||||
- `fuzziness`: The fuzziness level for the query (default is 0).
|
||||
- `maxExpansions`: The maximum number of terms to consider for fuzzy matching (default is 50).
|
||||
- `operator`: The logical operator to use for combining terms in the query (default is "OR").
|
||||
- `prefixLength`: The number of leading characters that are left unchanged during fuzzy matching.
|
||||
|
||||
* **options.boost?**: `number`
|
||||
|
||||
@@ -50,6 +51,8 @@ Creates an instance of MatchQuery.
|
||||
|
||||
* **options.operator?**: [`Operator`](../enumerations/Operator.md)
|
||||
|
||||
* **options.prefixLength?**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`MatchQuery`](MatchQuery.md)
|
||||
|
||||
@@ -612,7 +612,7 @@ of the given query
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
* **query**: `string` \| [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md) \| [`FullTextQuery`](../interfaces/FullTextQuery.md)
|
||||
the query, a vector or string
|
||||
|
||||
* **queryType?**: `string`
|
||||
@@ -799,7 +799,7 @@ by `query`.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md)
|
||||
* **vector**: [`IntoVector`](../type-aliases/IntoVector.md) \| [`MultiVector`](../type-aliases/MultiVector.md)
|
||||
|
||||
#### Returns
|
||||
|
||||
|
||||
@@ -386,6 +386,53 @@ called then every valid row from the table will be returned.
|
||||
|
||||
***
|
||||
|
||||
### maximumNprobes()
|
||||
|
||||
```ts
|
||||
maximumNprobes(maximumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the maximum number of probes used.
|
||||
|
||||
This controls the maximum number of partitions that will be searched. If this
|
||||
number is greater than minimumNprobes then the excess partitions will _only_ be
|
||||
searched if we have not found enough results. This can be useful when there is
|
||||
a narrow filter to allow these queries to spend more time searching and avoid
|
||||
potential false negatives.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **maximumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### minimumNprobes()
|
||||
|
||||
```ts
|
||||
minimumNprobes(minimumNprobes): VectorQuery
|
||||
```
|
||||
|
||||
Set the minimum number of probes used.
|
||||
|
||||
This controls the minimum number of partitions that will be searched. This
|
||||
parameter will impact every query against a vector index, regardless of the
|
||||
filter. See `nprobes` for more details. Higher values will increase recall
|
||||
but will also increase latency.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **minimumNprobes**: `number`
|
||||
|
||||
#### Returns
|
||||
|
||||
[`VectorQuery`](VectorQuery.md)
|
||||
|
||||
***
|
||||
|
||||
### nprobes()
|
||||
|
||||
```ts
|
||||
@@ -413,6 +460,10 @@ For best results we recommend tuning this parameter with a benchmark against
|
||||
your actual data to find the smallest possible value that will still give
|
||||
you the desired recall.
|
||||
|
||||
For more fine grained control over behavior when you have a very narrow filter
|
||||
you can use `minimumNprobes` and `maximumNprobes`. This method sets both
|
||||
the minimum and maximum to the same value.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **nprobes**: `number`
|
||||
|
||||
@@ -10,6 +10,7 @@ Enum representing the occurrence of terms in full-text queries.
|
||||
|
||||
- `Must`: The term must be present in the document.
|
||||
- `Should`: The term should contribute to the document score, but is not required.
|
||||
- `MustNot`: The term must not be present in the document.
|
||||
|
||||
## Enumeration Members
|
||||
|
||||
@@ -21,6 +22,14 @@ Must: "MUST";
|
||||
|
||||
***
|
||||
|
||||
### MustNot
|
||||
|
||||
```ts
|
||||
MustNot: "MUST_NOT";
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### Should
|
||||
|
||||
```ts
|
||||
|
||||
@@ -84,6 +84,7 @@
|
||||
- [FieldLike](type-aliases/FieldLike.md)
|
||||
- [IntoSql](type-aliases/IntoSql.md)
|
||||
- [IntoVector](type-aliases/IntoVector.md)
|
||||
- [MultiVector](type-aliases/MultiVector.md)
|
||||
- [RecordBatchLike](type-aliases/RecordBatchLike.md)
|
||||
- [SchemaLike](type-aliases/SchemaLike.md)
|
||||
- [TableLike](type-aliases/TableLike.md)
|
||||
|
||||
@@ -23,7 +23,7 @@ whether to remove punctuation
|
||||
### baseTokenizer?
|
||||
|
||||
```ts
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace";
|
||||
optional baseTokenizer: "raw" | "simple" | "whitespace" | "ngram";
|
||||
```
|
||||
|
||||
The tokenizer to use when building the index.
|
||||
@@ -71,6 +71,36 @@ tokens longer than this length will be ignored
|
||||
|
||||
***
|
||||
|
||||
### ngramMaxLength?
|
||||
|
||||
```ts
|
||||
optional ngramMaxLength: number;
|
||||
```
|
||||
|
||||
ngram max length
|
||||
|
||||
***
|
||||
|
||||
### ngramMinLength?
|
||||
|
||||
```ts
|
||||
optional ngramMinLength: number;
|
||||
```
|
||||
|
||||
ngram min length
|
||||
|
||||
***
|
||||
|
||||
### prefixOnly?
|
||||
|
||||
```ts
|
||||
optional prefixOnly: boolean;
|
||||
```
|
||||
|
||||
whether to only index the prefix of the token for ngram tokenizer
|
||||
|
||||
***
|
||||
|
||||
### removeStopWords?
|
||||
|
||||
```ts
|
||||
|
||||
@@ -24,10 +24,10 @@ The default is 7 days
|
||||
// Delete all versions older than 1 day
|
||||
const olderThan = new Date();
|
||||
olderThan.setDate(olderThan.getDate() - 1);
|
||||
tbl.cleanupOlderVersions(olderThan);
|
||||
tbl.optimize({cleanupOlderThan: olderThan});
|
||||
|
||||
// Delete all versions except the current version
|
||||
tbl.cleanupOlderVersions(new Date());
|
||||
tbl.optimize({cleanupOlderThan: new Date()});
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
11
docs/src/js/type-aliases/MultiVector.md
Normal file
11
docs/src/js/type-aliases/MultiVector.md
Normal file
@@ -0,0 +1,11 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / MultiVector
|
||||
|
||||
# Type Alias: MultiVector
|
||||
|
||||
```ts
|
||||
type MultiVector: IntoVector[];
|
||||
```
|
||||
@@ -30,7 +30,8 @@ excluded_globs = [
|
||||
"../src/rag/advanced_techniques/*.md",
|
||||
"../src/guides/scalar_index.md",
|
||||
"../src/guides/storage.md",
|
||||
"../src/search.md"
|
||||
"../src/search.md",
|
||||
"../src/guides/sql_querying.md",
|
||||
]
|
||||
|
||||
python_prefix = "py"
|
||||
|
||||
65
node/package-lock.json
generated
65
node/package-lock.json
generated
@@ -326,6 +326,71 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-RiYqpKuq9v8A4wFuHt1iPNFYjWJ1KgGFLJwQO4ajp9Hee84sDHq8mP0ATgMcc24hiaOUQ1lRRTULjGbHn4NIYw==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-togdP0YIjMYg/hBRMMxW434i5VB789JWU5o3hWrodbX8olEc0Txqw5Dg9CgIOldBIiCti6uTSQiTo6uldZon1w==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-ErS4IQDQVTYVATPeOj/dZXQR34eZQ5rAXm3vJdQi5K6X4zCDaIjOhpmnwzPBGT9W1idaBAoDJhtNfsFaJ6/PQQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-ycDpyBGbfxtnGGa/RQo5+So6dHALiem1pbYc/LDKKluUJpadtXtEwC61o6hZTcejoYjhEE8ET7vA3OCEJfMFaw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.21.2-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.2-beta.0.tgz",
|
||||
"integrity": "sha512-IgVkAP/LiNIQD5P6n/9x3bgQOt5pGJarjtSF8r+ialD95QHmo6tcxrwTy/DlA+H1uI6B6h+sbN0c1KXTh1rYcg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
]
|
||||
},
|
||||
"node_modules/@neon-rs/cli": {
|
||||
"version": "0.0.160",
|
||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||
|
||||
@@ -1863,4 +1863,43 @@ describe("column name options", () => {
|
||||
expect(results[0].query_index).toBe(0);
|
||||
expect(results[1].query_index).toBe(1);
|
||||
});
|
||||
|
||||
test("index and search multivectors", async () => {
  const db = await connect(tmpDir.name);

  // Generate 256 rows, each holding a multivector made of 10 two-dimensional
  // vectors. Every component is drawn independently: filling a vector with a
  // single `Math.random()` value would make all vectors parallel ([r, r]),
  // which is degenerate under the cosine distance used by the index below.
  const data: { multivector: number[][] }[] = [];
  for (let i = 0; i < 256; i++) {
    data.push({
      multivector: Array.from({ length: 10 }, () =>
        Array.from({ length: 2 }, () => Math.random()),
      ),
    });
  }

  // A multivector column is a variable-length list of fixed-size float vectors.
  const table = await db.createTable("multivectors", data, {
    schema: new Schema([
      new Field(
        "multivector",
        new List(
          new Field(
            "item",
            new FixedSizeList(2, new Field("item", new Float32())),
          ),
        ),
      ),
    ]),
  });

  // Search before any index exists.
  const results = await table.search(data[0].multivector).limit(10).toArray();
  expect(results.length).toBe(10);

  await table.createIndex("multivector", {
    config: Index.ivfPq({ numPartitions: 2, distanceType: "cosine" }),
  });

  // Same search again, now with the IVF-PQ index in place.
  const results2 = await table
    .search(data[0].multivector)
    .limit(10)
    .toArray();
  expect(results2.length).toBe(10);
});
|
||||
});
|
||||
|
||||
@@ -107,6 +107,20 @@ export type IntoVector =
|
||||
| number[]
|
||||
| Promise<Float32Array | Float64Array | number[]>;
|
||||
|
||||
export type MultiVector = IntoVector[];
|
||||
|
||||
/**
 * Type guard for a multivector query value.
 *
 * Returns `true` only for a non-empty array in which every element is itself
 * a single vector according to {@link isIntoVector}. Checking every element
 * (rather than only the first) keeps the predicate honest for mixed arrays
 * such as `[[1, 2], "x"]`.
 *
 * @param value - the value to inspect
 * @returns whether `value` can be treated as a `MultiVector`
 */
export function isMultiVector(value: unknown): value is MultiVector {
  return (
    Array.isArray(value) &&
    value.length > 0 &&
    value.every((item) => isIntoVector(item))
  );
}

/**
 * Type guard for a single, already-resolved vector value.
 *
 * Accepts `Float32Array`, `Float64Array`, or a plain array whose first element
 * is not itself an array. An empty array therefore counts as a vector, and the
 * element types of a plain array are not inspected. Promises are not
 * recognised by this check.
 *
 * @param value - the value to inspect
 * @returns whether `value` looks like a single vector
 */
export function isIntoVector(value: unknown): value is IntoVector {
  return (
    value instanceof Float32Array ||
    value instanceof Float64Array ||
    (Array.isArray(value) && !Array.isArray(value[0]))
  );
}
|
||||
|
||||
export function isArrowTable(value: object): value is TableLike {
|
||||
if (value instanceof ArrowTable) return true;
|
||||
return "schema" in value && "batches" in value;
|
||||
|
||||
@@ -100,6 +100,7 @@ export {
|
||||
RecordBatchLike,
|
||||
DataLike,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
} from "./arrow";
|
||||
export { IntoSql, packBits } from "./util";
|
||||
|
||||
|
||||
@@ -6,9 +6,11 @@ import {
|
||||
Data,
|
||||
DataType,
|
||||
IntoVector,
|
||||
MultiVector,
|
||||
Schema,
|
||||
dataTypeToJson,
|
||||
fromDataToBuffer,
|
||||
isMultiVector,
|
||||
tableFromIPC,
|
||||
} from "./arrow";
|
||||
|
||||
@@ -346,7 +348,7 @@ export abstract class Table {
|
||||
* if the query is a string and no embedding function is defined, it will be treated as a full text search query
|
||||
*/
|
||||
abstract search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType?: string,
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query;
|
||||
@@ -357,7 +359,7 @@ export abstract class Table {
|
||||
* is the same thing as calling `nearestTo` on the builder returned
|
||||
* by `query`. @see {@link Query#nearestTo} for more details.
|
||||
*/
|
||||
abstract vectorSearch(vector: IntoVector): VectorQuery;
|
||||
abstract vectorSearch(vector: IntoVector | MultiVector): VectorQuery;
|
||||
/**
|
||||
* Add new columns with defined values.
|
||||
* @param {AddColumnsSql[]} newColumnTransforms pairs of column names and
|
||||
@@ -668,7 +670,7 @@ export class LocalTable extends Table {
|
||||
}
|
||||
|
||||
search(
|
||||
query: string | IntoVector | FullTextQuery,
|
||||
query: string | IntoVector | MultiVector | FullTextQuery,
|
||||
queryType: string = "auto",
|
||||
ftsColumns?: string | string[],
|
||||
): VectorQuery | Query {
|
||||
@@ -715,7 +717,15 @@ export class LocalTable extends Table {
|
||||
return this.query().nearestTo(queryPromise);
|
||||
}
|
||||
|
||||
vectorSearch(vector: IntoVector): VectorQuery {
|
||||
vectorSearch(vector: IntoVector | MultiVector): VectorQuery {
|
||||
if (isMultiVector(vector)) {
|
||||
const query = this.query().nearestTo(vector[0]);
|
||||
for (const v of vector.slice(1)) {
|
||||
query.addQueryVector(v);
|
||||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
return this.query().nearestTo(vector);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user