Compare commits

...

13 Commits

Author SHA1 Message Date
Lance Release
995bd9bf37 Bump version: 0.18.0-beta.1 → 0.18.0 2025-01-14 01:02:26 +00:00
Lance Release
36cc06697f Bump version: 0.18.0-beta.0 → 0.18.0-beta.1 2025-01-14 01:02:25 +00:00
Will Jones
35da464591 ci: fix stable check (#2019) 2025-01-13 17:01:54 -08:00
Will Jones
31f9c30ffb chore: fix test of error message (#2018)
Addresses failure on `main`:
https://github.com/lancedb/lancedb/actions/runs/12757756657/job/35558683317
2025-01-13 15:36:46 -08:00
Will Jones
92dcf24b0c feat: upgrade Lance to v0.22.0 (#2017)
Upstream changelog:
https://github.com/lancedb/lance/releases/tag/v0.22.0
2025-01-13 15:06:01 -08:00
Will Jones
6b0adba2d9 chore: add deprecation warning to vectordb (#2003) 2025-01-13 14:53:12 -08:00
BubbleCal
66cbf6b6c5 feat: support multivector type (#2005)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-01-13 14:10:40 -08:00
Keming
ce9506db71 docs(hnsw): fix markdown list style (#2015) 2025-01-13 08:53:13 -08:00
Prashant Dixit
b66cd943a7 fix: broken voyageai embedding API (#2013)
This PR fixes the broken Embedding API for Voyageai.
2025-01-13 08:52:38 -08:00
Weston Pace
d8d11f48e7 feat: upgrade to lance 0.22.0b1 (#2011) 2025-01-10 12:51:52 -08:00
Lance Release
7ec5df3022 Updating package-lock.json 2025-01-10 19:58:10 +00:00
Lance Release
b17304172c Updating package-lock.json 2025-01-10 19:02:31 +00:00
Lance Release
fbe5408434 Updating package-lock.json 2025-01-10 19:02:15 +00:00
21 changed files with 341 additions and 280 deletions

View File

@@ -21,16 +21,14 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.21.1", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-io = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-index = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-linalg = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-table = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-testing = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-datafusion = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance-encoding = { version = "=0.21.1", git = "https://github.com/lancedb/lance.git", tag = "v0.21.1-beta.2" }
lance = { "version" = "=0.22.0", "features" = ["dynamodb"] }
lance-io = "=0.22.0"
lance-index = "=0.22.0"
lance-linalg = "=0.22.0"
lance-table = "=0.22.0"
lance-testing = "=0.22.0"
lance-datafusion = "=0.22.0"
lance-encoding = "=0.22.0"
# Note that this one does not include pyarrow
arrow = { version = "53.2", optional = false }
arrow-array = "53.2"
@@ -42,8 +40,8 @@ arrow-arith = "53.2"
arrow-cast = "53.2"
async-trait = "0"
chrono = "0.4.35"
datafusion-common = "42.0"
datafusion-physical-plan = "42.0"
datafusion-common = "44.0"
datafusion-physical-plan = "44.0"
env_logger = "0.10"
half = { "version" = "=2.4.1", default-features = false, features = [
"num-traits",

View File

@@ -12,7 +12,7 @@ with open("Cargo.toml", "rb") as f:
elif isinstance(dep, dict):
# Version doesn't have the beta tag in it, so we instead look
# at the git tag.
version = dep["tag"]
version = dep.get('tag', dep.get('version'))
else:
raise ValueError("Unexpected type for dependency: " + str(dep))

BIN
docs/src/assets/maxsim.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -7,7 +7,7 @@ Approximate Nearest Neighbor (ANN) search is a method for finding data points ne
There are three main types of ANN search algorithms:
* **Tree-based search algorithms**: Use a tree structure to organize and store data points.
* * **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice.
* **Hash-based search algorithms**: Use a specialized geometric hash table to store and manage data points. These algorithms typically focus on theoretical guarantees, and don't usually perform as well as the other approaches in practice.
* **Graph-based search algorithms**: Use a graph structure to store data points, which can be a bit complex.
HNSW is a graph-based algorithm. All graph-based search algorithms rely on the idea of a k-nearest neighbor (or k-approximate nearest neighbor) graph, which we outline below.

View File

@@ -138,6 +138,36 @@ LanceDB supports binary vectors as a data type, and has the ability to search bi
--8<-- "python/python/tests/docs/test_binary_vector.py:async_binary_vector"
```
## Multivector type
LanceDB supports the multivector type, which is useful when you have multiple vectors for a single item (e.g. with ColBERT and ColPali).
You can create an index on a column with the multivector type and search on it. The query can be a single vector or multiple vectors. If the query consists of multiple vectors `mq`, the similarity (distance) from it to any multivector `mv` in the dataset is defined as:
![maxsim](assets/maxsim.png)
where `sim` is the similarity function (e.g. cosine).
For now, only the `cosine` metric is supported for multivector search.
=== "Python"
=== "sync API"
```python
--8<-- "python/python/tests/docs/test_multivector.py:imports"
--8<-- "python/python/tests/docs/test_multivector.py:sync_multivector"
```
=== "async API"
```python
--8<-- "python/python/tests/docs/test_multivector.py:imports"
--8<-- "python/python/tests/docs/test_multivector.py:async_multivector"
```
## Search with distance range
You can also search for vectors within a specific distance range from the query vector. This is useful when you want to find vectors that are not just the nearest neighbors, but also those that are within a certain distance. This can be done by using the `distance_range` method.

View File

@@ -18,7 +18,7 @@ import numpy as np
uri = "data/sample-lancedb"
data = [{"vector": row, "item": f"item {i}", "id": i}
for i, row in enumerate(np.random.random((10_000, 2)).astype('int'))]
for i, row in enumerate(np.random.random((10_000, 2)))]
# Synchronous client
db = lancedb.connect(uri)

View File

@@ -2,6 +2,9 @@
A JavaScript / Node.js library for [LanceDB](https://github.com/lancedb/lancedb).
**DEPRECATED: This library is deprecated. Please use the new client,
[@lancedb/lancedb](https://www.npmjs.com/package/@lancedb/lancedb).**
## Installation
```bash

75
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.14.2-beta.0",
"version": "0.15.0-beta.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.14.2-beta.0",
"version": "0.15.0-beta.0",
"cpu": [
"x64",
"arm64"
@@ -52,14 +52,14 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.14.2-beta.0",
"@lancedb/vectordb-darwin-x64": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.14.2-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.14.2-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.14.2-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.14.2-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
"@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -330,9 +330,9 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.14.2-beta.0.tgz",
"integrity": "sha512-nsXOl9M8jhsr/LrfvrVHiuWWj/zX3zU2Aahpw8etjJbnU83nmO1r9agPxN6mD/J60EsLP3gDaiRPaFY66pHScA==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0-beta.0.tgz",
"integrity": "sha512-4sPAW4p1YFVfURyf0k017l6LRCz+VmN9fVUBy7W27b6EOQ3xuIb3t5xq3JAtslMPWBP3wxP8rKXXDmlbqDg3+g==",
"cpu": [
"arm64"
],
@@ -343,9 +343,9 @@
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.14.2-beta.0.tgz",
"integrity": "sha512-E1ouo0EfGaxG26YWnw717vaHGNLulmqzh6eaTQuj45Vd4GaPj07TJygtDyvMFBJdsZjdY5YIc9U8yIem1NfeKQ==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0-beta.0.tgz",
"integrity": "sha512-uzGINrBBsZattB4/ZYxdGNkTxNh3MqE6Y4nF762qo0zWWSiu+QNHQ+ZyLAZ2lwrEvwxs8LUaJNmnpn3nocHc1A==",
"cpu": [
"x64"
],
@@ -356,9 +356,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.14.2-beta.0.tgz",
"integrity": "sha512-SewXZLGccZUkONACHHPCW1Z7xsz8MaXifwpaWMEyIzbQBFAIMq30lPZN63bTt/zNo6BcBPv54yz6n1ZfCv5l+w==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0-beta.0.tgz",
"integrity": "sha512-bgphfea8h65vJ+bAL+vb+XEfmjskLZ+trZ3GN4n6SICU7XMGSFPl9xzPLGAj1WsoFCTJHe87DRYQpsWGlOI/LQ==",
"cpu": [
"arm64"
],
@@ -369,9 +369,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-arm64-musl": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.14.2-beta.0.tgz",
"integrity": "sha512-ppq3P2QYxPHmikY6nbWTwMhDGP+e+feqzm4iXKhpBxzHR2XwoY5CtDKgKDfEHy1FyCoIyvh2yYT2M1TSkrkOBw==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0-beta.0.tgz",
"integrity": "sha512-GpmVgqMS9ztNX53z8v0JdZiG6K1cK+mJnGZd3Gzguiavrly4mkYZ8IKNwWP9RmewUMNsFWR0IzD4VR+ojVpjlQ==",
"cpu": [
"arm64"
],
@@ -382,9 +382,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.14.2-beta.0.tgz",
"integrity": "sha512-XgkoarmdS42fLMMqNdHTVja2z7a0/Q4h3X+n14Ph/pkYsb7pmOabV4a7+ej8KJPm1wv2GmDA4GXcFPjF0tFBFA==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0-beta.0.tgz",
"integrity": "sha512-6Y/39TDv4UDVWnl8UpUJ8mqv9rUNc9Q5VR510I7w34c0ChdWvjqdcy+JFnGrraamE1DA8E6wGEz+5oG0zprkNg==",
"cpu": [
"x64"
],
@@ -395,9 +395,9 @@
]
},
"node_modules/@lancedb/vectordb-linux-x64-musl": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.14.2-beta.0.tgz",
"integrity": "sha512-vGgUOVb43eccF0oz2YJK+Zionwk4ODelHU7icmGeVsULkkFkoAbf0nO4PY38ZAeLsodnLxHIIu51Bd4Jm9m20w==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0-beta.0.tgz",
"integrity": "sha512-GRdW2dhf6DmynhRojjtQjs8DeARM1WpbZZKXukeofOSMv6JoRBSWKw2DzW5sF/285IMU81B0OXZE75QjLp+VJg==",
"cpu": [
"x64"
],
@@ -407,10 +407,23 @@
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-arm64-msvc": {
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0-beta.0.tgz",
"integrity": "sha512-2EmRHuqqj8kC5ArUZztUWWTfNd774zL68btOlyhYL1CAiet5jIeGuFWJifdh+PXfQeLoa4GLW5LwyudIR4IHwA==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.14.2-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.14.2-beta.0.tgz",
"integrity": "sha512-zGLC382V3gE1MHQpf0XTe34yiB+6ZtSIuOFMIDEZVI5PVN5XkXULMY6dlt5fvo4IxhRoscGjpmmaNxJzUwigDg==",
"version": "0.15.0-beta.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0-beta.0.tgz",
"integrity": "sha512-lWq9b7LnWMGO0zDsp3rsLYyAzLooV7zQP77ph9Qv9fF0e4egD5l6SmMsAdQqLQnlhbQjkRjt3XRoDsqI809fcw==",
"cpu": [
"x64"
],

View File

@@ -767,7 +767,9 @@ describe("When creating an index", () => {
)
.column("vec")
.toArrow(),
).rejects.toThrow(/.* query dim=64, expected vector dim=32.*/);
).rejects.toThrow(
/.* query dim\(64\) doesn't match the column vec vector dim\(32\).*/,
);
const query64 = Array(64)
.fill(1)

151
nodejs/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.14.2-beta.0",
"version": "0.15.0-beta.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.14.2-beta.0",
"version": "0.15.0-beta.0",
"cpu": [
"x64",
"arm64"
@@ -18,7 +18,6 @@
"win32"
],
"dependencies": {
"@lancedb/lancedb": "^0.14.1",
"reflect-metadata": "^0.2.2"
},
"devDependencies": {
@@ -4150,152 +4149,6 @@
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@lancedb/lancedb": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb/-/lancedb-0.14.1.tgz",
"integrity": "sha512-DfJ887t52n/2s8G1JnzE7gAR4i7UnfP1OjDYnJ4yTk0aIcn76CbVOUegYfURYlYjL+QFdI1MrAzUdMgYgsGGcA==",
"cpu": [
"x64",
"arm64"
],
"license": "Apache 2.0",
"os": [
"darwin",
"linux",
"win32"
],
"dependencies": {
"reflect-metadata": "^0.2.2"
},
"engines": {
"node": ">= 18"
},
"optionalDependencies": {
"@lancedb/lancedb-darwin-arm64": "0.14.1",
"@lancedb/lancedb-darwin-x64": "0.14.1",
"@lancedb/lancedb-linux-arm64-gnu": "0.14.1",
"@lancedb/lancedb-linux-arm64-musl": "0.14.1",
"@lancedb/lancedb-linux-x64-gnu": "0.14.1",
"@lancedb/lancedb-linux-x64-musl": "0.14.1",
"@lancedb/lancedb-win32-arm64-msvc": "0.14.1",
"@lancedb/lancedb-win32-x64-msvc": "0.14.1"
},
"peerDependencies": {
"apache-arrow": ">=15.0.0 <=18.1.0"
}
},
"node_modules/@lancedb/lancedb-darwin-arm64": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-arm64/-/lancedb-darwin-arm64-0.14.1.tgz",
"integrity": "sha512-eSWV3GydXfyaptPXZ+S3BgXY1YI26oHQDekACaVevRW6/YQD7sS9UhhSZn1mYyDtLTfJu2kOK2XHA9UY8nyuTg==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-darwin-x64": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-darwin-x64/-/lancedb-darwin-x64-0.14.1.tgz",
"integrity": "sha512-ecf50ykF9WCWmpwAjs3Mk2mph7d+rMJ9EVJeX0UJ4KHDC874lnTDo6Tfd9iUcbExtNI1KZbu+CFnYsbQU+R0gw==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-arm64-gnu": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.14.1.tgz",
"integrity": "sha512-X7ub1fOm7jZ19KFW/u3nDyFvj5XzDPqEVrp9mmcOgSrst3NJEGGBz1JypkLnTWpg/7IpCBs1UO1G7R7LEsHYOA==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-arm64-musl": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-musl/-/lancedb-linux-arm64-musl-0.14.1.tgz",
"integrity": "sha512-rkiWpsQCXwybwEjcdFXkAeGahiLcK/NQUjZc9WBY6CKk2Y9dICIafYzxZ6MDCY19jeJIgs3JS0mjleUWYr3JFw==",
"cpu": [
"arm64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-x64-gnu": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-gnu/-/lancedb-linux-x64-gnu-0.14.1.tgz",
"integrity": "sha512-LGp4D58pQJ3+H3GncNxWHkvhIVOKpTzYUBtVfC8he1rwZ6+CiYDyK9Sim/j8o3UJlJ7cP0m3gNUzPfQchQF9WA==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-linux-x64-musl": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-x64-musl/-/lancedb-linux-x64-musl-0.14.1.tgz",
"integrity": "sha512-V/TeoyKUESPL/8L1z4WLbMFe5ZEv4gtxc0AFK8ghiduFYN/Hckuj4oTo/Y0ysLiBx1At9FCa91hWDB301ibHBg==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@lancedb/lancedb-win32-x64-msvc": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@lancedb/lancedb-win32-x64-msvc/-/lancedb-win32-x64-msvc-0.14.1.tgz",
"integrity": "sha512-4M8D0j8/3WZv4CKo+Z44sISKPCKWN5MWA0dcEEGw4sEXHF2RJLrMIOOgEpT5NF7VW+X4t2JJxUA6j2T3cXaD8w==",
"cpu": [
"x64"
],
"license": "Apache 2.0",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 18"
}
},
"node_modules/@napi-rs/cli": {
"version": "2.18.3",
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.3.tgz",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.18.0-beta.0"
current_version = "0.18.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.18.0-beta.0"
version = "0.18.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dynamic = ["version"]
dependencies = [
"deprecation",
"pylance==0.21.1b1",
"pylance==0.22.0",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",
@@ -53,7 +53,7 @@ tests = [
"pytz",
"polars>=0.19, <=1.3.0",
"tantivy",
"pyarrow-stubs"
"pyarrow-stubs",
]
dev = ["ruff", "pre-commit", "pyright"]
docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]

View File

@@ -59,7 +59,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
.create(name="voyage-3")
class TextModel(LanceModel):
data: str = voyageai.SourceField()
text: str = voyageai.SourceField()
vector: Vector(voyageai.ndims()) = voyageai.VectorField()
data = [ { "text": "hello world" },
@@ -74,6 +74,14 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
name: str
client: ClassVar = None
text_embedding_models: list = [
"voyage-3",
"voyage-3-lite",
"voyage-finance-2",
"voyage-law-2",
"voyage-code-2",
]
multimodal_embedding_models: list = ["voyage-multimodal-3"]
def ndims(self):
if self.name == "voyage-3-lite":
@@ -115,13 +123,14 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
truncation: Optional[bool]
"""
if self.name in ["voyage-multimodal-3"]:
rs = VoyageAIEmbeddingFunction._get_client().multimodal_embed(
inputs=[[text]], model=self.name, **kwargs
)
client = VoyageAIEmbeddingFunction._get_client()
if self.name in self.text_embedding_models:
rs = client.embed(texts=[text], model=self.name, **kwargs)
elif self.name in self.multimodal_embedding_models:
rs = client.multimodal_embed(inputs=[[text]], model=self.name, **kwargs)
else:
rs = VoyageAIEmbeddingFunction._get_client().embed(
texts=[text], model=self.name, **kwargs
raise ValueError(
f"Model {self.name} not supported to generate text embeddings"
)
return rs.embeddings[0]

View File

@@ -1741,12 +1741,14 @@ class AsyncQuery(AsyncQueryBase):
a default `limit` of 10 will be used.
Typically, a single vector is passed in as the query. However, you can also
pass in multiple vectors. This can be useful if you want to find the nearest
vectors to multiple query vectors. This is not expected to be faster than
making multiple queries concurrently; it is just a convenience method.
If multiple vectors are passed in then an additional column `query_index`
will be added to the results. This column will contain the index of the
query vector that the result is nearest to.
pass in multiple vectors. When multiple vectors are passed in and the vector
column has a multivector type, the vectors will be treated as a single
query. Otherwise, the vectors will be treated as multiple queries, which can
be useful if you want to find the nearest vectors to multiple query vectors.
This is not expected to be faster than making multiple queries concurrently;
it is just a convenience method. If multiple vectors are passed in then
an additional column `query_index` will be added to the results. This column
will contain the index of the query vector that the result is nearest to.
"""
if query_vector is None:
raise ValueError("query_vector can not be None")

View File

@@ -2856,6 +2856,8 @@ class AsyncTable:
async_query = async_query.with_row_id()
if query.vector:
# We need the schema to get the vector column type
# to determine whether the vectors are batch queries or not.
async_query = (
async_query.nearest_to(query.vector)
.distance_type(query.metric)

View File

@@ -223,7 +223,7 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
vector_col_count = 0
for field_name in schema.names:
field = schema.field(field_name)
if pa.types.is_fixed_size_list(field.type):
if is_vector_column(field.type):
vector_col_count += 1
if vector_col_count > 1:
raise ValueError(
@@ -231,7 +231,6 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
"Please specify the vector column name "
"for vector search"
)
break
elif vector_col_count == 1:
vector_col_name = field_name
if vector_col_count == 0:
@@ -242,6 +241,29 @@ def inf_vector_column_query(schema: pa.Schema) -> str:
return vector_col_name
def is_vector_column(data_type: pa.DataType) -> bool:
    """
    Tell whether an Arrow data type describes a vector column.

    A fixed-size list whose values are floating point or uint8 counts as a
    vector. A (variable-size) list counts as a vector when its value type
    does, which is checked recursively.

    Parameters
    ----------
    data_type : pa.DataType
        The data type of the column.

    Returns
    -------
    bool: True if the column is a vector column.
    """
    if pa.types.is_fixed_size_list(data_type):
        element_type = data_type.value_type
        if pa.types.is_floating(element_type) or pa.types.is_uint8(element_type):
            return True
    if pa.types.is_list(data_type):
        # Nested list: defer the decision to the inner value type.
        return is_vector_column(data_type.value_type)
    return False
def infer_vector_column_name(
schema: pa.Schema,
query_type: str,

View File

@@ -68,6 +68,60 @@ async def table_struct_async(tmp_path) -> AsyncTable:
return await conn.create_table("test_struct", table)
@pytest.fixture
def multivec_table() -> lancedb.table.Table:
    """Create an in-memory table of 256 rows with a multivector column.

    Every row stores two 2-dimensional float32 vectors in "vector", plus an
    integer "id" and a float "float_field", both counting up from 1.
    """
    connection = lancedb.connect("memory://")
    row_count = 256
    # Each row is a list of two 2-dimensional vectors (nested structure).
    multivectors = []
    for i in range(row_count):
        multivectors.append([[i, i + 1], [i + 2, i + 3]])
    ids = list(range(1, row_count + 1))
    float_values = [float(value) for value in ids]
    multivector_type = pa.list_(pa.list_(pa.float32(), list_size=2))
    arrow_table = pa.table(
        {
            "vector": pa.array(multivectors, type=multivector_type),
            "id": pa.array(ids),
            "float_field": pa.array(float_values),
        }
    )
    return connection.create_table("test", arrow_table)
@pytest_asyncio.fixture
async def multivec_table_async(tmp_path) -> AsyncTable:
    """Create an in-memory async table of 256 rows with a multivector column.

    Every row stores two 2-dimensional float32 vectors in "vector", plus an
    integer "id" and a float "float_field", both counting up from 1.
    """
    connection = await lancedb.connect_async(
        "memory://", read_consistency_interval=timedelta(seconds=0)
    )
    row_count = 256
    # Each row is a list of two 2-dimensional vectors (nested structure).
    multivectors = []
    for i in range(row_count):
        multivectors.append([[i, i + 1], [i + 2, i + 3]])
    ids = list(range(1, row_count + 1))
    float_values = [float(value) for value in ids]
    multivector_type = pa.list_(pa.list_(pa.float32(), list_size=2))
    arrow_table = pa.table(
        {
            "vector": pa.array(multivectors, type=multivector_type),
            "id": pa.array(ids),
            "float_field": pa.array(float_values),
        }
    )
    return await connection.create_table("test_async", arrow_table)
def test_cast(table):
class TestModel(LanceModel):
vector: Vector(2)
@@ -177,6 +231,62 @@ async def test_distance_range_async(table_async: AsyncTable):
assert res["_distance"].to_pylist() == [min_dist, max_dist]
def test_multivector(multivec_table: lancedb.table.Table):
    """Search an indexed multivector column with single and multiple vectors.

    Checks that querying with the same vector twice returns as many rows as
    the single-vector query and exactly doubles the first two distances, and
    that queries with a mismatched dimension raise.
    """
    # Build the index on the multivector column.
    multivec_table.create_index(
        metric="cosine",
        vector_column_name="vector",
        index_type="IVF_PQ",
        num_partitions=1,
        num_sub_vectors=2,
    )

    single_result = multivec_table.search([1, 2]).to_arrow()
    multi_result = multivec_table.search([[1, 2], [1, 2]]).to_arrow()

    assert len(multi_result) == len(single_result)
    for row in range(2):
        single_distance = single_result["_distance"][row].as_py()
        assert multi_result["_distance"][row].as_py() == single_distance * 2

    # A wrong dimension must be rejected, both for a single vector and for
    # a list in which only some of the vectors have the wrong dimension.
    for bad_query in ([1, 2, 3], [[1, 2], [1, 2, 3]]):
        with pytest.raises(Exception):
            multivec_table.search(bad_query).to_arrow()
@pytest.mark.asyncio
async def test_multivector_async(multivec_table_async: AsyncTable):
    """Async variant: search an indexed multivector column.

    Checks that querying with the same vector twice returns as many rows as
    the single-vector query and exactly doubles the first two distances, and
    that queries with a mismatched dimension raise.
    """
    # Build the index on the multivector column.
    index_config = IvfPq(distance_type="cosine", num_partitions=1, num_sub_vectors=2)
    await multivec_table_async.create_index("vector", config=index_config)

    single_result = await multivec_table_async.query().nearest_to([1, 2]).to_arrow()
    multi_result = (
        await multivec_table_async.query().nearest_to([[1, 2], [1, 2]]).to_arrow()
    )

    assert len(multi_result) == len(single_result)
    for row in range(2):
        single_distance = single_result["_distance"][row].as_py()
        assert multi_result["_distance"][row].as_py() == single_distance * 2

    # A wrong dimension must be rejected, both for a single vector and for
    # a list in which only some of the vectors have the wrong dimension.
    for bad_query in ([1, 2, 3], [[1, 2], [1, 2, 3]]):
        with pytest.raises(Exception):
            await multivec_table_async.query().nearest_to(bad_query).to_arrow()
def test_vector_query_with_no_limit(table):
with pytest.raises(ValueError):
LanceVectorQueryBuilder(table, [0, 0], "vector").limit(0).select(
@@ -448,11 +558,13 @@ async def test_query_to_pandas_flatten_async(table_struct_async: AsyncTable):
@pytest.mark.asyncio
async def test_query_to_polars_async(table_async: AsyncTable):
schema = await table_async.schema()
num_columns = len(schema.names)
df = await table_async.query().to_polars()
assert df.shape == (2, 5)
assert df.shape == (2, num_columns)
df = await table_async.query().where("id < 0").to_polars()
assert df.shape == (0, 5)
assert df.shape == (0, num_columns)
@pytest.mark.asyncio

View File

@@ -52,7 +52,7 @@ fn create_some_records() -> Result<Box<dyn RecordBatchReader + Send>> {
.iter()
.step_by(1024)
.take(500)
.map(|w| *w)
.copied()
.collect::<Vec<_>>();
let n_terms = 3;
let batches = RecordBatchIterator::new(
@@ -95,7 +95,7 @@ async fn search_index(table: &Table) -> Result<()> {
.iter()
.step_by(1024)
.take(500)
.map(|w| *w)
.copied()
.collect::<Vec<_>>();
let query = words[0].to_owned();
println!("Searching for: {}", query);

View File

@@ -18,7 +18,7 @@ use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
use arrow::array::AsArray;
use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
use arrow::datatypes::{Float32Type, UInt8Type};
use arrow_array::{RecordBatchIterator, RecordBatchReader};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
@@ -1902,68 +1902,74 @@ impl TableInternal for NativeTable {
options: QueryExecutionOptions,
) -> Result<Arc<dyn ExecutionPlan>> {
let ds_ref = self.dataset.get().await?;
let mut column = query.column.clone();
let schema = ds_ref.schema();
let mut query_vector = query.query_vector.first().cloned();
if query.query_vector.len() > 1 {
// If there are multiple query vectors, create a plan for each of them and union them.
let query_vecs = query.query_vector.clone();
let plan_futures = query_vecs
.into_iter()
.map(|query_vector| {
let mut sub_query = query.clone();
sub_query.query_vector = vec![query_vector];
let options_ref = options.clone();
async move { self.create_plan(&sub_query, options_ref).await }
})
.collect::<Vec<_>>();
let plans = futures::future::try_join_all(plan_futures).await?;
return Table::multi_vector_plan(plans);
if column.is_none() {
// Infer a vector column with the same dimension of the query vector.
let arrow_schema = Schema::from(ds_ref.schema());
column = Some(default_vector_column(
&arrow_schema,
Some(query.query_vector[0].len() as i32),
)?);
}
let vector_field = schema.field(column.as_ref().unwrap()).unwrap();
if let DataType::List(_) = vector_field.data_type() {
// it's multivector, then the vectors should be treated as single query
// concatenate the vectors into a FixedSizeList<FixedSizeList<_>>
// it's also possible to concatenate the vectors into a List<FixedSizeList<_>>,
// but FixedSizeList is more efficient and easier to construct
let vectors = query
.query_vector
.iter()
.map(|arr| arr.as_ref())
.collect::<Vec<_>>();
let dim = vectors[0].len();
let mut fsl_builder = FixedSizeListBuilder::with_capacity(
Float32Builder::with_capacity(dim),
dim as i32,
vectors.len(),
);
for vec in vectors {
fsl_builder
.values()
.append_slice(vec.as_primitive::<Float32Type>().values());
fsl_builder.append(true);
}
query_vector = Some(Arc::new(fsl_builder.finish()));
} else {
// If there are multiple query vectors, create a plan for each of them and union them.
let query_vecs = query.query_vector.clone();
let plan_futures = query_vecs
.into_iter()
.map(|query_vector| {
let mut sub_query = query.clone();
sub_query.query_vector = vec![query_vector];
let options_ref = options.clone();
async move { self.create_plan(&sub_query, options_ref).await }
})
.collect::<Vec<_>>();
let plans = futures::future::try_join_all(plan_futures).await?;
return Table::multi_vector_plan(plans);
}
}
let mut scanner: Scanner = ds_ref.scan();
if let Some(query_vector) = query.query_vector.first() {
if let Some(query_vector) = query_vector {
// If there is a vector query, default to limit=10 if unspecified
let column = if let Some(col) = query.column.as_ref() {
col.clone()
let column = if let Some(col) = column {
col
} else {
// Infer a vector column with the same dimension of the query vector.
let arrow_schema = Schema::from(ds_ref.schema());
default_vector_column(&arrow_schema, Some(query_vector.len() as i32))?
};
let field = ds_ref.schema().field(&column).ok_or(Error::Schema {
message: format!("Column {} not found in dataset schema", column),
})?;
let mut is_binary = false;
if let arrow_schema::DataType::FixedSizeList(element, dim) = field.data_type() {
match element.data_type() {
e_type if e_type.is_floating() => {}
e_type if *e_type == DataType::UInt8 => {
is_binary = true;
}
_ => {
return Err(Error::InvalidInput {
message: format!(
"The data type of the vector column '{}' is not a floating point type",
column
),
});
}
}
if dim != query_vector.len() as i32 {
return Err(Error::InvalidInput {
message: format!(
"The dimension of the query vector does not match with the dimension of the vector column '{}': \
query dim={}, expected vector dim={}",
column,
query_vector.len(),
dim,
),
});
}
}
let (_, element_type) = lance::index::vector::utils::get_vector_type(schema, &column)?;
let is_binary = matches!(element_type, DataType::UInt8);
if is_binary {
let query_vector = arrow::compute::cast(&query_vector, &DataType::UInt8)?;
let query_vector = query_vector.as_primitive::<UInt8Type>();
@@ -1973,10 +1979,9 @@ impl TableInternal for NativeTable {
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
} else {
let query_vector = query_vector.as_primitive::<Float32Type>();
scanner.nearest(
&column,
query_vector,
query_vector.as_ref(),
query.base.limit.unwrap_or(DEFAULT_TOP_K),
)?;
}

View File

@@ -108,13 +108,8 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
let candidates = schema
.fields()
.iter()
.filter_map(|field| match field.data_type() {
arrow_schema::DataType::FixedSizeList(f, d)
if (f.data_type().is_floating() || f.data_type() == &DataType::UInt8)
&& dim.map(|expect| *d == expect).unwrap_or(true) =>
{
Some(field.name())
}
.filter_map(|field| match inf_vector_dim(field) {
Some(d) if dim.is_none() || dim == Some(d) => Some(field.name()),
_ => None,
})
.collect::<Vec<_>>();
@@ -138,6 +133,20 @@ pub(crate) fn default_vector_column(schema: &Schema, dim: Option<i32>) -> Result
}
}
/// Infers the vector dimension of `field`, if it holds vectors.
///
/// Returns the list size for a `FixedSizeList` whose items are floating
/// point or `UInt8`. For a `List`, the item field is inspected recursively.
/// Every other data type yields `None`.
fn inf_vector_dim(field: &arrow_schema::Field) -> Option<i32> {
    match field.data_type() {
        arrow_schema::DataType::FixedSizeList(item, dim)
            if item.data_type().is_floating() || item.data_type() == &DataType::UInt8 =>
        {
            Some(*dim)
        }
        // A list of vectors: the dimension comes from the inner field.
        arrow_schema::DataType::List(item) => inf_vector_dim(item),
        _ => None,
    }
}
pub fn supported_btree_data_type(dtype: &DataType) -> bool {
dtype.is_integer()
|| dtype.is_floating()
@@ -171,9 +180,10 @@ pub fn supported_fts_data_type(dtype: &DataType) -> bool {
pub fn supported_vector_data_type(dtype: &DataType) -> bool {
match dtype {
DataType::FixedSizeList(inner, _) => {
DataType::is_floating(inner.data_type()) || *inner.data_type() == DataType::UInt8
DataType::FixedSizeList(field, _) => {
field.data_type().is_floating() || field.data_type() == &DataType::UInt8
}
DataType::List(field) => supported_vector_data_type(field.data_type()),
_ => false,
}
}