Compare commits

...

27 Commits

Author SHA1 Message Date
Lance Release
3dc1803c07 Bump version: 0.18.0 → 0.18.1-beta.0 2025-01-17 04:37:23 +00:00
BubbleCal
d0501f65f1 fix: linear reranker applies wrong score to combine (#2035)
related to #2014 
this fixes:
- the linear reranker may lose some results if the merge consumes all
vector results before the fts results
- the linear reranker inverts the fts score, but only the vector distance
should be inverted

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-17 11:33:48 +08:00
Bert
4703cc6894 chore: upgrade lance to v0.22.1-beta.3 (#2038) 2025-01-16 12:42:42 -05:00
BubbleCal
493f9ce467 fix: can't infer the vector column for multivector (#2026)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-16 14:08:04 +08:00
Weston Pace
5c759505b8 feat: upgrade lance 0.22.1b1 (#2029)
Now the version actually exists :)
2025-01-15 07:37:37 -08:00
BubbleCal
bb6a39727e fix: missing distance type for auto index on RemoteTable (#2027)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-15 20:28:55 +08:00
BubbleCal
d57bed90e5 docs: add missing example code (#2025) 2025-01-14 21:17:05 -08:00
BubbleCal
648327e90c docs: show how to pack bits for binary vector (#2020)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-14 09:00:57 -08:00
Lance Release
6c7e81ee57 Updating package-lock.json 2025-01-14 02:14:37 +00:00
Lance Release
905e9d4738 Updating package-lock.json 2025-01-14 01:03:49 +00:00
Lance Release
38642e349c Updating package-lock.json 2025-01-14 01:03:33 +00:00
Lance Release
6879861ea8 Bump version: 0.15.0-beta.1 → 0.15.0 2025-01-14 01:03:04 +00:00
Lance Release
88325e488e Bump version: 0.15.0-beta.0 → 0.15.0-beta.1 2025-01-14 01:02:59 +00:00
Lance Release
995bd9bf37 Bump version: 0.18.0-beta.1 → 0.18.0 2025-01-14 01:02:26 +00:00
Lance Release
36cc06697f Bump version: 0.18.0-beta.0 → 0.18.0-beta.1 2025-01-14 01:02:25 +00:00
Will Jones
35da464591 ci: fix stable check (#2019) 2025-01-13 17:01:54 -08:00
Will Jones
31f9c30ffb chore: fix test of error message (#2018)
Addresses failure on `main`:
https://github.com/lancedb/lancedb/actions/runs/12757756657/job/35558683317
2025-01-13 15:36:46 -08:00
Will Jones
92dcf24b0c feat: upgrade Lance to v0.22.0 (#2017)
Upstream changelog:
https://github.com/lancedb/lance/releases/tag/v0.22.0
2025-01-13 15:06:01 -08:00
Will Jones
6b0adba2d9 chore: add deprecation warning to vectordb (#2003) 2025-01-13 14:53:12 -08:00
BubbleCal
66cbf6b6c5 feat: support multivector type (#2005)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-13 14:10:40 -08:00
Keming
ce9506db71 docs(hnsw): fix markdown list style (#2015) 2025-01-13 08:53:13 -08:00
Prashant Dixit
b66cd943a7 fix: broken voyageai embedding API (#2013)
This PR fixes the broken Embedding API for Voyageai.
2025-01-13 08:52:38 -08:00
Weston Pace
d8d11f48e7 feat: upgrade to lance 0.22.0b1 (#2011) 2025-01-10 12:51:52 -08:00
Lance Release
7ec5df3022 Updating package-lock.json 2025-01-10 19:58:10 +00:00
Lance Release
b17304172c Updating package-lock.json 2025-01-10 19:02:31 +00:00
Lance Release
fbe5408434 Updating package-lock.json 2025-01-10 19:02:15 +00:00
Lance Release
3f3f845c5a Bump version: 0.14.2-beta.0 → 0.15.0-beta.0 2025-01-10 19:01:47 +00:00
42 changed files with 565 additions and 374 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.14.2-beta.0"
current_version = "0.15.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -21,16 +21,16 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.21.1", "features" = [
lance = { "version" = "=0.22.1", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-io = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-index = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-linalg = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-table = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-testing = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-datafusion = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-encoding = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
], git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-io = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-index = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-linalg = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-table = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-testing = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-datafusion = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
lance-encoding = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
# Note that this one does not include pyarrow
arrow = { version = "53.2", optional = false }
arrow-array = "53.2"
@@ -42,8 +42,8 @@ arrow-arith = "53.2"
arrow-cast = "53.2"
async-trait = "0"
chrono = "0.4.35"
datafusion-common = "42.0"
datafusion-physical-plan = "42.0"
datafusion-common = "44.0"
datafusion-physical-plan = "44.0"
env_logger = "0.10"
half = { "version" = "=2.4.1", default-features = false, features = [
"num-traits",

View File

@@ -12,7 +12,7 @@ with open("Cargo.toml", "rb") as f:
elif isinstance(dep, dict):
# Version doesn't have the beta tag in it, so we instead look
# at the git tag.
version = dep["tag"]
version = dep.get('tag', dep.get('version'))
else:
raise ValueError("Unexpected type for dependency: " + str(dep))

BIN docs/src/assets/maxsim.png (new binary file, 10 KiB; binary diff not shown)

View File

@@ -7,7 +7,7 @@ Approximate Nearest Neighbor (ANN) search is a method for finding data points ne
There are three main types of ANN search algorithms:
* **Tree-based search algorithms**: Use a tree structure to organize and store data points.
* * **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice.
* **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice.
* **Graph-based search algorithms**: Use a graph structure to store data points, which can be a bit complex.
HNSW is a graph-based algorithm. All graph-based search algorithms rely on the idea of a k-nearest neighbor (or k-approximate nearest neighbor) graph, which we outline below.

View File

@@ -138,6 +138,36 @@ LanceDB supports binary vectors as a data type, and has the ability to search bi
--8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector"
```
## Multivector type
LanceDB supports a multivector type, which is useful when a single item has multiple vectors (e.g. with ColBERT and ColPali).
You can index a column of multivector type and search on it; the query can be a single vector or multiple vectors. If the query is a set of vectors `mq`, its similarity (distance) to any multivector `mv` in the dataset is defined as:
![maxsim](assets/maxsim.png)
where `sim` is the similarity function (e.g. cosine).
For now, only the `cosine` metric is supported for multivector search.
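As a plain-text stand-in for the image (an assumption about what `maxsim.png` depicts, based on the standard ColBERT-style MaxSim and on the behaviour exercised in the tests further down):

$$
\mathrm{sim}(mq, mv) = \sum_{q \in mq} \max_{v \in mv} \mathrm{sim}(q, v)
$$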
=== "Python"
=== "sync API"
```python
--8<-- "python/python/tests/docs/test_multivector.py:imports"
--8<-- "python/python/tests/docs/test_multivector.py:sync_multivector"
```
=== "async API"
```python
--8<-- "python/python/tests/docs/test_multivector.py:imports"
--8<-- "python/python/tests/docs/test_multivector.py:async_multivector"
```
## Search with distance range
You can also search for vectors within a specific distance range from the query vector. This is useful when you want to find vectors that are not just the nearest neighbors, but also those that are within a certain distance. This can be done by using the `distance_range` method.
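A minimal sketch of what such a call might look like with the sync API; only the `distance_range` method name is confirmed above, so the table name, dimensionality, and the positional `(lower, upper)` bounds are assumptions:

```python
import lancedb
import numpy as np

db = lancedb.connect("data/sample-lancedb")
tbl = db.open_table("my_vectors")  # hypothetical table with a 256-dim vector column

query = np.random.random(256)
# keep only hits whose distance to `query` lies between 0.1 and 0.5
hits = tbl.search(query).distance_range(0.1, 0.5).to_arrow()
```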

View File

@@ -18,7 +18,7 @@ import numpy as np
uri = "data/sample-lancedb"
data = [{"vector": row, "item": f"item {i}", "id": i}
for i, row in enumerate(np.random.random((10_000, 2)).astype('int'))]
for i, row in enumerate(np.random.random((10_000, 2)))]
# Synchronous client
db = lancedb.connect(uri)

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.14.2-beta.0</version>
<version>0.15.0-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.14.2-beta.0</version>
<version>0.15.0-final.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

View File

@@ -2,6 +2,9 @@
A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).
**DEPRECATED: This library is deprecated. Please use the new client,
[@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb).**
## Installation
```bash

node/package-lock.json (generated, 75 changed lines)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"cpu": [
"x64",
"arm64"
@@ -52,14 +52,14 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.14.2-beta.0",
"@lancedb/vectordb-darwin-x64": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.14.2-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.14.2-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.15.0",
"@lancedb/vectordb-darwin-x64": "0.15.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -330,9 +330,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.14.2-beta.0.tgz",
"integrity": "sha512-nsXOl9M8jhsr/LrfvrVHiuWWj/zX3zU2Aahpw8etjJbnU83nmO1r9agPxN6mD/J60EsLP3gDaiRPaFY66pHScA==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0.tgz",
"integrity": "sha512-FnBRsCrxvecjhkMQus9M9RQpXyhu1jxQjYGDaqqRIfcUd3ew7ahIR4qk9FyALHmjpPd72xJZgNLjliHtsIX4/w==",
"cpu": [
"arm64"
],
@@ -343,9 +343,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.14.2-beta.0.tgz",
"integrity": "sha512-E1ouo0EfGaxG26YWnw717vaHGNLulmqzh6eaTQuj45Vd4GaPj07TJygtDyvMFBJdsZjdY5YIc9U8yIem1NfeKQ==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0.tgz",
"integrity": "sha512-zy+nt1WBCabVI16u2t3sqGUXBOmnF5ZXMsHa9TWYEXVnbw5112K7/1783DTNA/ZBI/WziUa5jqYQ0GOwkgruqA==",
"cpu": [
"x64"
],
@@ -356,9 +356,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.14.2-beta.0.tgz",
"integrity": "sha512-SewXZLGccZUkONACHHPCW1Z7xsz8MaXifwpaWMEyIzbQBFAIMq30lPZN63bTt/zNo6BcBPv54yz6n1ZfCv5l+w==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0.tgz",
"integrity": "sha512-2Pbw+z5Ij5QBvmBxmjaT5F2lNHftVWlarDM1bDc4JtgodJ3Js729qnVLQ0yehnlt+hM6aGFEyn8bH5vf6gEvpQ==",
"cpu": [
"arm64"
],
@@ -369,9 +369,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-musl": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.14.2-beta.0.tgz",
"integrity": "sha512-ppq3P2QYxPHmikY6nbWTwMhDGP+e+feqzm4iXKhpBxzHR2XwoY5CtDKgKDfEHy1FyCoIyvh2yYT2M1TSkrkOBw==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0.tgz",
"integrity": "sha512-WIvgd2EY2maCdYNHPC0C9RprjNWL83FkQKtn591xixltFk3XKgvBQ2USZW2tXndH/WVdvFQvystmZ3dgUrh8DQ==",
"cpu": [
"arm64"
],
@@ -382,9 +382,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.14.2-beta.0.tgz",
"integrity": "sha512-XgkoarmdS42fLMMqNdHTVja2z7a0/Q4h3X+n14Ph/pkYsb7pmOabV4a7+ej8KJPm1wv2GmDA4GXcFPjF0tFBFA==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0.tgz",
"integrity": "sha512-Pet3aPE+yQT13Gm0+fh11pgHvImS4X8Uf0zRdzsx0eja7x8j15VrVcZTEVTT4QdBNiZrhXBuiq482NJBsqe6vw==",
"cpu": [
"x64"
],
@@ -395,9 +395,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-musl": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.14.2-beta.0.tgz",
"integrity": "sha512-vGgUOVb43eccF0oz2YJK+Zionwk4ODelHU7icmGeVsULkkFkoAbf0nO4PY38ZAeLsodnLxHIIu51Bd4Jm9m20w==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0.tgz",
"integrity": "sha512-BC1RvIoEmyOr7ENp618vs9F05gdN7aKlToJNZnGIoi++hRZ25y39B1xxMXQHDnUL8G+Ur9kJObfQ43nVWqueTQ==",
"cpu": [
"x64"
],
@@ -407,10 +407,23 @@
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-arm64-msvc": {
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0.tgz",
"integrity": "sha512-H9BeryZl1aLxldtVP0XyiQJyzKStkuxS6SmIg+zaANr9Dns+LmVxYCz429JLC0DlvBWoYjTfK9WJTgMSZXr0Cg==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.14.2-beta.0.tgz",
"integrity": "sha512-zGLC382V3gE1MHQpf0XTe34yiB+6ZtSIuOFMIDEZVI5PVN5XkXULMY6dlt5fvo4IxhRoscGjpmmaNxJzUwigDg==",
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0.tgz",
"integrity": "sha512-J8JICux2M82OR27i/4YAbEPlvszuE7EnGIU5jmm2+RTFaptKOCshH1C4D4jEXDAaHcUkVgsxyc9lGmGJCkGLhg==",
"cpu": [
"x64"
],

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
@@ -92,13 +92,13 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.14.2-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.14.2-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.14.2-beta.0"
"@lancedb/vectordb-darwin-x64": "0.15.0",
"@lancedb/vectordb-darwin-arm64": "0.15.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.14.2-beta.0"
version = "0.15.0"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -767,7 +767,9 @@ describe("When creating an index", () => {
)
.column("vec")
.toArrow(),
).rejects.toThrow(/.* query dim=64, expected vector dim=32.*/);
).rejects.toThrow(
/.* query dim\(64\) doesn't match the column vec vector dim\(32\).*/,
);
const query64 = Array(64)
.fill(1)

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

nodejs/package-lock.json (generated, 151 changed lines)
View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.14.2-beta.0",
"version": "0.15.0",
"cpu": [
"x64",
"arm64"
@@ -18,7 +18,6 @@
"win32"
],
"dependencies": {
"@lancedb/lancedb": "^0.14.1",
"reflect-metadata": "^0.2.2"
},
"devDependencies": {
@@ -4150,152 +4149,6 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@lancedb/lancedb": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb/-/lancedb-0.14.1.tgz",
"integrity": "sha512-DfJ887t52n/2s8G1JnzE7gAR4i7UnfP1OjDYnJ4yTk0aIcn76CbVOUegYfURYlYjL+QFdI1MrAzUdMgYgsGGcA==",
"cpu": [
"x64",
"arm64"
],
"license": "Apache 2.0",
"os": [
"darwin",
"linux",
"win32"
],
"dependencies": {
"reflect-metadata": "^0.2.2"
},
"engines": {
"node": ">= 18"
},
"optionalDependencies": {
"@lancedb/lancedb-darwin-arm64": "0.14.1",
"@lancedb/lancedb-darwin-x64": "0.14.1",
"@lancedb/lancedb-linux-arm64-gnu": "0.14.1",
"@lancedb/lancedb-linux-arm64-musl": "0.14.1",
"@lancedb/lancedb-linux-x64-gnu": "0.14.1",
"@lancedb/lancedb-linux-x64-musl": "0.14.1",
"@lancedb/lancedb-win32-arm64-msvc": "0.14.1",
"@lancedb/lancedb-win32-x64-msvc": "0.14.1"
},
"peerDependencies": {
"apache-arrow": ">=15.0.0 <=18.1.0"
}
},
"node_modules/@lancedb/lancedb-darwin-arm64": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.14.1.tgz",
"integrity": "sha512-eSWV3GydXfyaptPXZ+S3BgXY1YI26oHQDekACaVevRW6/YQD7sS9UhhSZn1mYyDtLTfJu2kOK2XHA9UY8nyuTg==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-darwin-x64": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.14.1.tgz",
"integrity": "sha512-ecf50ykF9WCWmpwAjs3Mk2mph7d+rMJ9EVJeX0UJ4KHDC874lnTDo6Tfd9iUcbExtNI1KZbu+CFnYsbQU+R0gw==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-arm64-gnu": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.14.1.tgz",
"integrity": "sha512-X7ub1fOm7jZ19KFW/u3nDyFvj5XzDPqEVrp9mmcOgSrst3NJEGGBz1JypkLnTWpg/7IpCBs1UO1G7R7LEsHYOA==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-arm64-musl": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-musl/-/lancedb-linux-arm64-musl-0.14.1.tgz",
"integrity": "sha512-rkiWpsQCXwybwEjcdFXkAeGahiLcK/NQUjZc9WBY6CKk2Y9dICIafYzxZ6MDCY19jeJIgs3JS0mjleUWYr3JFw==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-x64-gnu": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.14.1.tgz",
"integrity": "sha512-LGp4D58pQJ3+H3GncNxWHkvhIVOKpTzYUBtVfC8he1rwZ6+CiYDyK9Sim/j8o3UJlJ7cP0m3gNUzPfQchQF9WA==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-x64-musl": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-musl/-/lancedb-linux-x64-musl-0.14.1.tgz",
"integrity": "sha512-V/TeoyKUESPL/8L1z4WLbMFe5ZEv4gtxc0AFK8ghiduFYN/Hckuj4oTo/Y0ysLiBx1At9FCa91hWDB301ibHBg==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-win32-x64-msvc": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.14.1.tgz",
"integrity": "sha512-4M8D0j8/3WZv4CKo+Z44sISKPCKWN5MWA0dcEEGw4sEXHF2RJLrMIOOgEpT5NF7VW+X4t2JJxUA6j2T3cXaD8w==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@napi-rs/cli": {
"version": "2.18.3",
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.3.tgz",

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.14.2-beta.0",
"version": "0.15.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.18.0-beta.0"
current_version = "0.18.1-beta.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.18.0-beta.0"
version = "0.18.1-beta.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dynamic = ["version"]
dependencies = [
"deprecation",
"pylance==0.21.1b1",
"pylance==0.22.1b3",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",
@@ -53,7 +53,7 @@ tests = [
"pytz",
"polars>=0.19, <=1.3.0",
"tantivy",
"pyarrow-stubs"
"pyarrow-stubs",
]
dev = ["ruff", "pre-commit", "pyright"]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]

View File

@@ -59,7 +59,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
.create(name="voyage-3")
class TextModel(LanceModel):
data: str = voyageai.SourceField()
text: str = voyageai.SourceField()
vector: Vector(voyageai.ndims()) = voyageai.VectorField()
data = [ { "text": "hello world" },
@@ -74,6 +74,14 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
name: str
client: ClassVar = None
text_embedding_models: list = [
"voyage-3",
"voyage-3-lite",
"voyage-finance-2",
"voyage-law-2",
"voyage-code-2",
]
multimodal_embedding_models: list = ["voyage-multimodal-3"]
def ndims(self):
if self.name == "voyage-3-lite":
@@ -115,13 +123,14 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
truncation: Optional[bool]
"""
if self.name in ["voyage-multimodal-3"]:
rs = VoyageAIEmbeddingFunction._get_client().multimodal_embed(
inputs=[[text]], model=self.name, **kwargs
)
client = VoyageAIEmbeddingFunction._get_client()
if self.name in self.text_embedding_models:
rs = client.embed(texts=[text], model=self.name, **kwargs)
elif self.name in self.multimodal_embedding_models:
rs = client.multimodal_embed(inputs=[[text]], model=self.name, **kwargs)
else:
rs = VoyageAIEmbeddingFunction._get_client().embed(
texts=[text], model=self.name, **kwargs
raise ValueError(
f"Model {self.name} not supported to generate text embeddings"
)
return rs.embeddings[0]
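For context, a hedged end-to-end sketch of how this embedding function is typically wired up, reconstructed from the docstring fragment above; the "voyageai" registry key, the database path, and the table name are assumptions, and a Voyage API key is required in the environment:

```python
import lancedb
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.pydantic import LanceModel, Vector

# registry key assumed to be "voyageai"
voyageai = EmbeddingFunctionRegistry.get_instance().get("voyageai").create(name="voyage-3")

class TextModel(LanceModel):
    text: str = voyageai.SourceField()
    vector: Vector(voyageai.ndims()) = voyageai.VectorField()

db = lancedb.connect("data/voyage-demo")          # hypothetical path
tbl = db.create_table("texts", schema=TextModel)
tbl.add([{"text": "hello world"}, {"text": "goodbye world"}])  # vectors are computed on insert
```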

View File

@@ -1741,12 +1741,14 @@ class AsyncQuery(AsyncQueryBase):
a default `limit` of 10 will be used.
Typically, a single vector is passed in as the query. However, you can also
pass in multiple vectors. This can be useful if you want to find the nearest
vectors to multiple query vectors. This is not expected to be faster than
making multiple queries concurrently; it is just a convenience method.
If multiple vectors are passed in then an additional column `query_index`
will be added to the results. This column will contain the index of the
query vector that the result is nearest to.
pass in multiple vectors. When multiple vectors are passed in, if the vector
column has a multivector type, the vectors are treated as a single query.
Otherwise, the vectors are treated as multiple queries, which can be useful
if you want to find the nearest vectors to multiple query vectors.
This is not expected to be faster than making multiple queries concurrently;
it is just a convenience method. If multiple vectors are passed in then
an additional column `query_index` will be added to the results. This column
will contain the index of the query vector that the result is nearest to.
"""
if query_vector is None:
raise ValueError("query_vector can not be None")
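A short sketch of the behaviour described above for a regular (non-multivector) column, using the async query builder; the helper itself is hypothetical and assumes the table already has a float vector column:

```python
async def nearest_to_many(tbl, queries, k=10):
    # Several query vectors in one call; the extra `query_index` column marks
    # which of the input vectors each result row is nearest to.
    results = await tbl.query().nearest_to(queries).limit(k).to_arrow()
    return results["query_index"], results
```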

View File

@@ -11,6 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from numpy import nan
import pyarrow as pa
@@ -95,43 +96,22 @@ class LinearCombinationReranker(Reranker):
pa.array([nan] * len(vector_results), type=pa.float32()),
)
return results
# sort both input tables on _rowid
combined_list = []
vector_list = vector_results.sort_by("_rowid").to_pylist()
fts_list = fts_results.sort_by("_rowid").to_pylist()
i, j = 0, 0
while i < len(vector_list):
if j >= len(fts_list):
for vi in vector_list[i:]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
break
vi = vector_list[i]
fj = fts_list[j]
# invert the fts score from relevance to distance
inverted_fts_score = self._invert_score(fj["_score"])
if vi["_rowid"] == fj["_rowid"]:
vi["_relevance_score"] = self._combine_score(
vi["_distance"], inverted_fts_score
)
vi["_score"] = fj["_score"] # keep the original score
combined_list.append(vi)
i += 1
j += 1
elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
combined_list.append(vi)
i += 1
results = defaultdict()
for vector_result in vector_results.to_pylist():
results[vector_result["_rowid"]] = vector_result
for fts_result in fts_results.to_pylist():
row_id = fts_result["_rowid"]
if row_id in results:
results[row_id]["_score"] = fts_result["_score"]
else:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
j += 1
if j < len(fts_list) - 1:
for fj in fts_list[j:]:
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
combined_list.append(fj)
results[row_id] = fts_result
combined_list = []
for row_id, result in results.items():
vector_score = self._invert_score(result.get("_distance", fill))
fts_score = result.get("_score", fill)
result["_relevance_score"] = self._combine_score(vector_score, fts_score)
combined_list.append(result)
relevance_score_schema = pa.schema(
[
@@ -148,10 +128,10 @@ class LinearCombinationReranker(Reranker):
tbl = self._keep_relevance_score(tbl)
return tbl
def _combine_score(self, score1, score2):
def _combine_score(self, vector_score, fts_score):
# these scores represent distance
return 1 - (self.weight * score1 + (1 - self.weight) * score2)
return 1 - (self.weight * vector_score + (1 - self.weight) * fts_score)
def _invert_score(self, score: float):
def _invert_score(self, dist: float):
# Invert the score between relevance and distance
return 1 - score
return 1 - dist
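To make the new control flow easier to follow in isolation, here is a standalone sketch that mirrors the merge above on plain dicts; the free-function packaging and default values are illustrative, not the library API:

```python
def merge_by_rowid(vector_rows, fts_rows, weight=0.7, fill=1.0):
    # Index every result by _rowid so a row returned by both searches is kept once.
    merged = {}
    for row in vector_rows:                 # dicts carrying "_rowid" and "_distance"
        merged[row["_rowid"]] = dict(row)
    for row in fts_rows:                    # dicts carrying "_rowid" and "_score"
        if row["_rowid"] in merged:
            merged[row["_rowid"]]["_score"] = row["_score"]
        else:
            merged[row["_rowid"]] = dict(row)
    combined = []
    for row in merged.values():
        # Mirror the diff: invert the vector distance, fall back to `fill` for
        # missing scores, then take the weighted linear combination.
        vector_score = 1 - row.get("_distance", fill)
        fts_score = row.get("_score", fill)
        row["_relevance_score"] = 1 - (weight * vector_score + (1 - weight) * fts_score)
        combined.append(row)
    return combined
```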

View File

@@ -2856,6 +2856,8 @@ class AsyncTable:
async_query = async_query.with_row_id()
if query.vector:
# we need the schema to get the vector column type
# to determine whether the vectors are batch queries or not
async_query = (
async_query.nearest_to(query.vector)
.distance_type(query.metric)

View File

@@ -223,7 +223,7 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
vector_col_count = 0
for field_name in schema.names:
field = schema.field(field_name)
if pa.types.is_fixed_size_list(field.type):
if is_vector_column(field.type):
vector_col_count += 1
if vector_col_count > 1:
raise ValueError(
@@ -231,7 +231,6 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
"Please specify the vector column name "
"for vector search"
)
break
elif vector_col_count == 1:
vector_col_name = field_name
if vector_col_count == 0:
@@ -242,6 +241,29 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
return vector_col_name
def is_vector_column(data_type: pa.DataType) -> bool:
"""
Check if the column is a vector column.
Parameters
----------
data_type : pa.DataType
The data type of the column.
Returns
-------
bool: True if the column is a vector column.
"""
if pa.types.is_fixed_size_list(data_type) and (
pa.types.is_floating(data_type.value_type)
or pa.types.is_uint8(data_type.value_type)
):
return True
elif pa.types.is_list(data_type):
return is_vector_column(data_type.value_type)
return False
def infer_vector_column_name(
schema: pa.Schema,
query_type: str,
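A self-contained illustration of the column types this helper accepts, reusing the same logic as the diff above; the asserts are purely illustrative:

```python
import pyarrow as pa

def is_vector_column(data_type: pa.DataType) -> bool:
    # FixedSizeList of float/uint8 is a vector column; a List wrapping such a
    # FixedSizeList is a multivector column, which also counts.
    if pa.types.is_fixed_size_list(data_type) and (
        pa.types.is_floating(data_type.value_type)
        or pa.types.is_uint8(data_type.value_type)
    ):
        return True
    if pa.types.is_list(data_type):
        return is_vector_column(data_type.value_type)
    return False

assert is_vector_column(pa.list_(pa.float32(), 128))            # single vector
assert is_vector_column(pa.list_(pa.list_(pa.float32(), 128)))  # multivector
assert is_vector_column(pa.list_(pa.uint8(), 32))               # packed binary vector
assert not is_vector_column(pa.list_(pa.float32()))             # variable-length float list
```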

View File

@@ -3,6 +3,7 @@ import shutil
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
import pytest
# --8<-- [end:imports]
@@ -12,16 +13,32 @@ shutil.rmtree("data/binary_lancedb", ignore_errors=True)
def test_binary_vector():
# --8<-- [start:sync_binary_vector]
db = lancedb.connect("data/binary_lancedb")
data = [
{
"id": i,
"vector": np.random.randint(0, 256, size=16),
}
for i in range(1024)
]
tbl = db.create_table("my_binary_vectors", data=data)
query = np.random.randint(0, 256, size=16)
tbl.search(query).metric("hamming").to_arrow()
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
tbl.search(packed_query).metric("hamming").to_arrow()
# --8<-- [end:sync_binary_vector]
db.drop_table("my_binary_vectors")
@@ -30,15 +47,31 @@ def test_binary_vector():
async def test_binary_vector_async():
# --8<-- [start:async_binary_vector]
db = await lancedb.connect_async("data/binary_lancedb")
data = [
{
"id": i,
"vector": np.random.randint(0, 256, size=16),
}
for i in range(1024)
]
tbl = await db.create_table("my_binary_vectors", data=data)
query = np.random.randint(0, 256, size=16)
await tbl.query().nearest_to(query).distance_type("hamming").to_arrow()
schema = pa.schema(
[
pa.field("id", pa.int64()),
# for dim=256, lance stores every 8 bits in a byte
# so the vector field should be a list of 256 / 8 = 32 bytes
pa.field("vector", pa.list_(pa.uint8(), 32)),
]
)
tbl = await db.create_table("my_binary_vectors", schema=schema)
data = []
for i in range(1024):
vector = np.random.randint(0, 2, size=256)
# pack the binary vector into bytes to save space
packed_vector = np.packbits(vector)
data.append(
{
"id": i,
"vector": packed_vector,
}
)
await tbl.add(data)
query = np.random.randint(0, 2, size=256)
packed_query = np.packbits(query)
await tbl.query().nearest_to(packed_query).distance_type("hamming").to_arrow()
# --8<-- [end:async_binary_vector]
await db.drop_table("my_binary_vectors")

View File

@@ -0,0 +1,77 @@
import shutil
from lancedb.index import IvfPq
import pytest
# --8<-- [start:imports]
import lancedb
import numpy as np
import pyarrow as pa
# --8<-- [end:imports]
shutil.rmtree("data/multivector_demo", ignore_errors=True)
def test_multivector():
# --8<-- [start:sync_multivector]
db = lancedb.connect("data/multivector_demo")
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
]
)
data = [
{
"id": i,
"vector": np.random.random(size=(2, 256)).tolist(),
}
for i in range(1024)
]
tbl = db.create_table("my_table", data=data, schema=schema)
# only cosine similarity is supported for multi-vectors
tbl.create_index(metric="cosine")
# query with single vector
query = np.random.random(256)
tbl.search(query).to_arrow()
# query with multiple vectors
query = np.random.random(size=(2, 256))
tbl.search(query).to_arrow()
# --8<-- [end:sync_multivector]
db.drop_table("my_table")
@pytest.mark.asyncio
async def test_multivector_async():
# --8<-- [start:async_multivector]
db = await lancedb.connect_async("data/multivector_demo")
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
]
)
data = [
{
"id": i,
"vector": np.random.random(size=(2, 256)).tolist(),
}
for i in range(1024)
]
tbl = await db.create_table("my_table", data=data, schema=schema)
# only cosine similarity is supported for multi-vectors
await tbl.create_index(column="vector", config=IvfPq(distance_type="cosine"))
# query with single vector
query = np.random.random(256)
await tbl.query().nearest_to(query).to_arrow()
# query with multiple vectors
query = np.random.random(size=(2, 256))
# --8<-- [end:async_multivector]
await db.drop_table("my_table")

View File

@@ -68,6 +68,60 @@ async def table_struct_async(tmp_path) -> AsyncTable:
return await conn.create_table("test_struct", table)
@pytest.fixture
def multivec_table() -> lancedb.table.Table:
db = lancedb.connect("memory://")
# Generate 256 rows of data
num_rows = 256
# Generate data for each column
vector_data = [
[[i, i + 1], [i + 2, i + 3]] for i in range(num_rows)
] # Adjust to match nested structure
id_data = list(range(1, num_rows + 1))
float_field_data = [float(i) for i in range(1, num_rows + 1)]
# Create the Arrow table
df = pa.table(
{
"vector": pa.array(
vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
),
"id": pa.array(id_data),
"float_field": pa.array(float_field_data),
}
)
return db.create_table("test", df)
@pytest_asyncio.fixture
async def multivec_table_async(tmp_path) -> AsyncTable:
conn = await lancedb.connect_async(
"memory://", read_consistency_interval=timedelta(seconds=0)
)
# Generate 256 rows of data
num_rows = 256
# Generate data for each column
vector_data = [
[[i, i + 1], [i + 2, i + 3]] for i in range(num_rows)
] # Adjust to match nested structure
id_data = list(range(1, num_rows + 1))
float_field_data = [float(i) for i in range(1, num_rows + 1)]
# Create the Arrow table
df = pa.table(
{
"vector": pa.array(
vector_data, type=pa.list_(pa.list_(pa.float32(), list_size=2))
),
"id": pa.array(id_data),
"float_field": pa.array(float_field_data),
}
)
return await conn.create_table("test_async", df)
def test_cast(table):
class TestModel(LanceModel):
vector: Vector(2)
@@ -177,6 +231,62 @@ async def test_distance_range_async(table_async: AsyncTable):
assert res["_distance"].to_pylist() == [min_dist, max_dist]
def test_multivector(multivec_table: lancedb.table.Table):
# create index on multivector
multivec_table.create_index(
metric="cosine",
vector_column_name="vector",
index_type="IVF_PQ",
num_partitions=1,
num_sub_vectors=2,
)
# query with single vector
q = [1, 2]
rs = multivec_table.search(q).to_arrow()
# query with multiple vectors
q = [[1, 2], [1, 2]]
rs2 = multivec_table.search(q).to_arrow()
assert len(rs2) == len(rs)
for i in range(2):
assert rs2["_distance"][i].as_py() == rs["_distance"][i].as_py() * 2
# can't query with vector that dim not matched
with pytest.raises(Exception):
multivec_table.search([1, 2, 3]).to_arrow()
# can't query with vector list that some dim not matched
with pytest.raises(Exception):
multivec_table.search([[1, 2], [1, 2, 3]]).to_arrow()
@pytest.mark.asyncio
async def test_multivector_async(multivec_table_async: AsyncTable):
# create index on multivector
await multivec_table_async.create_index(
"vector",
config=IvfPq(distance_type="cosine", num_partitions=1, num_sub_vectors=2),
)
# query with single vector
q = [1, 2]
rs = await multivec_table_async.query().nearest_to(q).to_arrow()
# query with multiple vectors
q = [[1, 2], [1, 2]]
rs2 = await multivec_table_async.query().nearest_to(q).to_arrow()
assert len(rs2) == len(rs)
for i in range(2):
assert rs2["_distance"][i].as_py() == rs["_distance"][i].as_py() * 2
# can't query with vector that dim not matched
with pytest.raises(Exception):
await multivec_table_async.query().nearest_to([1, 2, 3]).to_arrow()
# can't query with vector list that some dim not matched
with pytest.raises(Exception):
await multivec_table_async.query().nearest_to([[1, 2], [1, 2, 3]]).to_arrow()
def test_vector_query_with_no_limit(table):
with pytest.raises(ValueError):
LanceVectorQueryBuilder(table, [0, 0], "vector").limit(0).select(
@@ -448,11 +558,13 @@ async def test_query_to_pandas_flatten_async(table_struct_async: AsyncTable):
@pytest.mark.asyncio
async def test_query_to_polars_async(table_async: AsyncTable):
schema = await table_async.schema()
num_columns = len(schema.names)
df = await table_async.query().to_polars()
assert df.shape == (2, 5)
assert df.shape == (2, num_columns)
df = await table_async.query().where("id < 0").to_polars()
assert df.shape == (0, 5)
assert df.shape == (0, num_columns)
@pytest.mark.asyncio

View File

@@ -3,6 +3,7 @@ import random
import lancedb
import numpy as np
import pyarrow as pa
import pytest
from lancedb.conftest import MockTextEmbeddingFunction # noqa
from lancedb.embeddings import EmbeddingFunctionRegistry
@@ -281,6 +282,31 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_linear_combination(tmp_path, use_tantivy):
reranker = LinearCombinationReranker()
vector_results = pa.Table.from_pydict(
{
"_rowid": [0, 1, 2, 3, 4],
"_distance": [0.1, 0.2, 0.3, 0.4, 0.5],
"_text": ["a", "b", "c", "d", "e"],
}
)
fts_results = pa.Table.from_pydict(
{
"_rowid": [1, 2, 3, 4, 5],
"_score": [0.1, 0.2, 0.3, 0.4, 0.5],
"_text": ["b", "c", "d", "e", "f"],
}
)
combined_results = reranker.merge_results(vector_results, fts_results, 1.0)
assert len(combined_results) == 6
assert "_rowid" in combined_results.column_names
assert "_text" in combined_results.column_names
assert "_distance" not in combined_results.column_names
assert "_score" not in combined_results.column_names
assert "_relevance_score" in combined_results.column_names
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.14.2-beta.0"
version = "0.15.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.14.2-beta.0"
version = "0.15.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -52,7 +52,7 @@ fn create_some_records() -> Result<Box<dyn RecordBatchReader + Send>> {
.iter()
.step_by(1024)
.take(500)
.map(|w| *w)
.copied()
.collect::<Vec<_>>();
let n_terms = 3;
let batches = RecordBatchIterator::new(
@@ -95,7 +95,7 @@ async fn search_index(table: &Table) -> Result<()> {
.iter()
.step_by(1024)
.take(500)
.map(|w| *w)
.copied()
.collect::<Vec<_>>();
let query = words[0].to_owned();
println!("Searching for: {}", query);

View File

@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
use crate::query::Select;
use crate::table::AddDataMode;
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
use crate::{Error, Table};
use crate::{DistanceType, Error, Table};
use arrow_array::RecordBatchReader;
use arrow_ipc::reader::FileReader;
use arrow_schema::{DataType, SchemaRef};
@@ -592,7 +592,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
message: format!("Column {} not found in schema", column),
})?;
if supported_vector_data_type(field.data_type()) {
("IVF_PQ", None)
("IVF_PQ", Some(DistanceType::L2))
} else if supported_btree_data_type(field.data_type()) {
("BTREE", None)
} else {

View File

@@ -18,7 +18,7 @@ use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
use arrow::array::AsArray;
use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
use arrow::datatypes::{Float32Type, UInt8Type};
use arrow_array::{RecordBatchIterator, RecordBatchReader};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
@@ -73,7 +73,7 @@ use crate::query::{
IntoQueryVector, Query, QueryExecutionOptions, Select, VectorQuery, DEFAULT_TOP_K,
};
use crate::utils::{
default_vector_column, supported_bitmap_data_type, supported_btree_data_type,
default_vector_column, infer_vector_dim, supported_bitmap_data_type, supported_btree_data_type,
supported_fts_data_type, supported_label_list_data_type, supported_vector_data_type,
PatchReadParam, PatchWriteParam,
};
@@ -1370,14 +1370,8 @@ impl NativeTable {
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
n
} else {
match field.data_type() {
arrow_schema::DataType::FixedSizeList(_, n) => {
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
}
_ => Err(Error::Schema {
message: format!("Column '{}' is not a FixedSizeList", field.name()),
}),
}?
let dim = infer_vector_dim(field.data_type())?;
suggested_num_sub_vectors(dim as u32)
};
let mut dataset = self.dataset.get_mut().await?;
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
@@ -1902,68 +1896,74 @@ impl TableInternal for NativeTable {
options: QueryExecutionOptions,
) -> Result<Arc<dyn ExecutionPlan>> {
let ds_ref = self.dataset.get().await?;
let mut column = query.column.clone();
let schema = ds_ref.schema();
let mut query_vector = query.query_vector.first().cloned();
if query.query_vector.len() > 1 {
// If there are multiple query vectors, create a plan for each of them and union them.
let query_vecs = query.query_vector.clone();
let plan_futures = query_vecs
.into_iter()
.map(|query_vector| {
let mut sub_query = query.clone();
sub_query.query_vector = vec![query_vector];
let options_ref = options.clone();
async move { self.create_plan(&sub_query, options_ref).await }
})
.collect::<Vec<_>>();
let plans = futures::future::try_join_all(plan_futures).await?;
return Table::multi_vector_plan(plans);
if column.is_none() {
// Infer a vector column with the same dimension of the query vector.
let arrow_schema = Schema::from(ds_ref.schema());
column = Some(default_vector_column(
&arrow_schema,
Some(query.query_vector[0].len() as i32),
)?);
}
let vector_field = schema.field(column.as_ref().unwrap()).unwrap();
if let DataType::List(_) = vector_field.data_type() {
// it's a multivector column, so the vectors should be treated as a single query
// concatenate the vectors into a FixedSizeList<FixedSizeList<_>>
// it's also possible to concatenate the vectors into a List<FixedSizeList<_>>,
// but FixedSizeList is more efficient and easier to construct
let vectors = query
.query_vector
.iter()
.map(|arr| arr.as_ref())
.collect::<Vec<_>>();
let dim = vectors[0].len();
let mut fsl_builder = FixedSizeListBuilder::with_capacity(
Float32Builder::with_capacity(dim),
dim as i32,
vectors.len(),
);
for vec in vectors {
fsl_builder
.values()
.append_slice(vec.as_primitive::<Float32Type>().values());
fsl_builder.append(true);
}
query_vector = Some(Arc::new(fsl_builder.finish()));
} else {
// If there are multiple query vectors, create a plan for each of them and union them.
let query_vecs = query.query_vector.clone();
let plan_futures = query_vecs
.into_iter()
.map(|query_vector| {
let mut sub_query = query.clone();
sub_query.query_vector = vec![query_vector];
let options_ref = options.clone();
async move { self.create_plan(&sub_query, options_ref).await }
})
.collect::<Vec<_>>();
let plans = futures::future::try_join_all(plan_futures).await?;
return Table::multi_vector_plan(plans);
}
}
let mut scanner: Scanner = ds_ref.scan();
if let Some(query_vector) = query.query_vector.first() {
if let Some(query_vector) = query_vector {
// If there is a vector query, default to limit=10 if unspecified
let column = if let Some(col) = query.column.as_ref() {
col.clone()
let column = if let Some(col) = column {
col
} else {
// Infer a vector column with the same dimension of the query vector.
let arrow_schema = Schema::from(ds_ref.schema());
default_vector_column(&arrow_schema, Some(query_vector.len() as i32))?
};
let field = ds_ref.schema().field(&column).ok_or(Error::Schema {
message: format!("Column {} not found in dataset schema", column),
})?;
let mut is_binary = false;
if let arrow_schema::DataType::FixedSizeList(element, dim) = field.data_type() {
match element.data_type() {
e_type if e_type.is_floating() => {}
e_type if *e_type == DataType::UInt8 => {
is_binary = true;
}
_ => {
return Err(Error::InvalidInput {
message: format!(
"The data type of the vector column '{}' is not a floating point type",
column
),
});
}
}
if dim != query_vector.len() as i32 {
return Err(Error::InvalidInput {
message: format!(
"The dimension of the query vector does not match with the dimension of the vector column '{}': \
query dim={}, expected vector dim={}",
column,
query_vector.len(),
dim,
),
});
}
}
let (_, element_type) = lance::index::vector::utils::get_vector_type(schema, &column)?;
let is_binary = matches!(element_type, DataType::UInt8);
if is_binary {
let query_vector = arrow::compute::cast(&query_vector, &DataType::UInt8)?;
let query_vector = query_vector.as_primitive::<UInt8Type>();
@@ -1973,10 +1973,9 @@ impl TableInternal for NativeTable {
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
} else {
let query_vector = query_vector.as_primitive::<Float32Type>();
scanner.nearest(
&column,
query_vector,
query_vector.as_ref(),
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
}

View File

@@ -108,13 +108,8 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
let candidates = schema
.fields()
.iter()
.filter_map(|field| match field.data_type() {
arrow_schema::DataType::FixedSizeList(f, d)
if (f.data_type().is_floating() || f.data_type() == &DataType::UInt8)
&& dim.map(|expect| *d == expect).unwrap_or(true) =>
{
Some(field.name())
}
.filter_map(|field| match inf_vector_dim(field) {
Some(d) if dim.is_none() || dim == Some(d) => Some(field.name()),
_ => None,
})
.collect::<Vec<_>>();
@@ -138,6 +133,20 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
}
}
fn inf_vector_dim(field: &arrow_schema::Field) -> Option<i32> {
match field.data_type() {
arrow_schema::DataType::FixedSizeList(f, d) => {
if f.data_type().is_floating() || f.data_type() == &DataType::UInt8 {
Some(*d)
} else {
None
}
}
arrow_schema::DataType::List(f) => inf_vector_dim(f),
_ => None,
}
}
pub fn supported_btree_data_type(dtype: &DataType) -> bool {
dtype.is_integer()
|| dtype.is_floating()
@@ -171,13 +180,32 @@ pub fn supported_fts_data_type(dtype: &DataType) -> bool {
pub fn supported_vector_data_type(dtype: &DataType) -> bool {
match dtype {
DataType::FixedSizeList(inner, _) => {
DataType::is_floating(inner.data_type()) || *inner.data_type() == DataType::UInt8
DataType::FixedSizeList(field, _) => {
field.data_type().is_floating() || field.data_type() == &DataType::UInt8
}
DataType::List(field) => supported_vector_data_type(field.data_type()),
_ => false,
}
}
// TODO: remove this after we expose the same function in Lance.
pub fn infer_vector_dim(data_type: &DataType) -> Result<usize> {
infer_vector_dim_impl(data_type, false)
}
fn infer_vector_dim_impl(data_type: &DataType, in_list: bool) -> Result<usize> {
match (data_type, in_list) {
(DataType::FixedSizeList(_, dim), _) => Ok(*dim as usize),
(DataType::List(inner), false) => infer_vector_dim_impl(inner.data_type(), true),
_ => Err(Error::InvalidInput {
message: format!(
"data type is not a vector (FixedSizeList or List<FixedSizeList>), but {:?}",
data_type
),
}),
}
}
/// Note: this is temporary until we get a proper datatype conversion in Lance.
pub fn string_to_datatype(s: &str) -> Option<DataType> {
let data_type = serde_json::Value::String(s.to_string());