mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 05:19:58 +00:00
Compare commits
11 Commits
v0.15.0
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3dc1803c07 | ||
|
|
d0501f65f1 | ||
|
|
4703cc6894 | ||
|
|
493f9ce467 | ||
|
|
5c759505b8 | ||
|
|
bb6a39727e | ||
|
|
d57bed90e5 | ||
|
|
648327e90c | ||
|
|
6c7e81ee57 | ||
|
|
905e9d4738 | ||
|
|
38642e349c |
18
Cargo.toml
18
Cargo.toml
@@ -21,14 +21,16 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.22.0", "features" = ["dynamodb"] }
|
||||
lance-io = "=0.22.0"
|
||||
lance-index = "=0.22.0"
|
||||
lance-linalg = "=0.22.0"
|
||||
lance-table = "=0.22.0"
|
||||
lance-testing = "=0.22.0"
|
||||
lance-datafusion = "=0.22.0"
|
||||
lance-encoding = "=0.22.0"
|
||||
lance = { "version" = "=0.22.1", "features" = [
|
||||
"dynamodb",
|
||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-io = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-index = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-linalg = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-table = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-testing = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-datafusion = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
lance-encoding = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "53.2", optional = false }
|
||||
arrow-array = "53.2"
|
||||
|
||||
68
node/package-lock.json
generated
68
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.15.0-beta.0",
|
||||
"version": "0.15.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.15.0-beta.0",
|
||||
"version": "0.15.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,14 +52,14 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.15.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.15.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.15.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.15.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.15.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.15.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.15.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -330,9 +330,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-4sPAW4p1YFVfURyf0k017l6LRCz+VmN9fVUBy7W27b6EOQ3xuIb3t5xq3JAtslMPWBP3wxP8rKXXDmlbqDg3+g==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0.tgz",
|
||||
"integrity": "sha512-FnBRsCrxvecjhkMQus9M9RQpXyhu1jxQjYGDaqqRIfcUd3ew7ahIR4qk9FyALHmjpPd72xJZgNLjliHtsIX4/w==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -343,9 +343,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-uzGINrBBsZattB4/ZYxdGNkTxNh3MqE6Y4nF762qo0zWWSiu+QNHQ+ZyLAZ2lwrEvwxs8LUaJNmnpn3nocHc1A==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0.tgz",
|
||||
"integrity": "sha512-zy+nt1WBCabVI16u2t3sqGUXBOmnF5ZXMsHa9TWYEXVnbw5112K7/1783DTNA/ZBI/WziUa5jqYQ0GOwkgruqA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -356,9 +356,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-bgphfea8h65vJ+bAL+vb+XEfmjskLZ+trZ3GN4n6SICU7XMGSFPl9xzPLGAj1WsoFCTJHe87DRYQpsWGlOI/LQ==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0.tgz",
|
||||
"integrity": "sha512-2Pbw+z5Ij5QBvmBxmjaT5F2lNHftVWlarDM1bDc4JtgodJ3Js729qnVLQ0yehnlt+hM6aGFEyn8bH5vf6gEvpQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -369,9 +369,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-musl": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-GpmVgqMS9ztNX53z8v0JdZiG6K1cK+mJnGZd3Gzguiavrly4mkYZ8IKNwWP9RmewUMNsFWR0IzD4VR+ojVpjlQ==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0.tgz",
|
||||
"integrity": "sha512-WIvgd2EY2maCdYNHPC0C9RprjNWL83FkQKtn591xixltFk3XKgvBQ2USZW2tXndH/WVdvFQvystmZ3dgUrh8DQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -382,9 +382,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-6Y/39TDv4UDVWnl8UpUJ8mqv9rUNc9Q5VR510I7w34c0ChdWvjqdcy+JFnGrraamE1DA8E6wGEz+5oG0zprkNg==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0.tgz",
|
||||
"integrity": "sha512-Pet3aPE+yQT13Gm0+fh11pgHvImS4X8Uf0zRdzsx0eja7x8j15VrVcZTEVTT4QdBNiZrhXBuiq482NJBsqe6vw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -395,9 +395,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-musl": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-GRdW2dhf6DmynhRojjtQjs8DeARM1WpbZZKXukeofOSMv6JoRBSWKw2DzW5sF/285IMU81B0OXZE75QjLp+VJg==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0.tgz",
|
||||
"integrity": "sha512-BC1RvIoEmyOr7ENp618vs9F05gdN7aKlToJNZnGIoi++hRZ25y39B1xxMXQHDnUL8G+Ur9kJObfQ43nVWqueTQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@@ -408,9 +408,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-arm64-msvc": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-2EmRHuqqj8kC5ArUZztUWWTfNd774zL68btOlyhYL1CAiet5jIeGuFWJifdh+PXfQeLoa4GLW5LwyudIR4IHwA==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0.tgz",
|
||||
"integrity": "sha512-H9BeryZl1aLxldtVP0XyiQJyzKStkuxS6SmIg+zaANr9Dns+LmVxYCz429JLC0DlvBWoYjTfK9WJTgMSZXr0Cg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@@ -421,9 +421,9 @@
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.15.0-beta.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0-beta.0.tgz",
|
||||
"integrity": "sha512-lWq9b7LnWMGO0zDsp3rsLYyAzLooV7zQP77ph9Qv9fF0e4egD5l6SmMsAdQqLQnlhbQjkRjt3XRoDsqI809fcw==",
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0.tgz",
|
||||
"integrity": "sha512-J8JICux2M82OR27i/4YAbEPlvszuE7EnGIU5jmm2+RTFaptKOCshH1C4D4jEXDAaHcUkVgsxyc9lGmGJCkGLhg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.15.0-beta.0",
|
||||
"version": "0.15.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.15.0-beta.0",
|
||||
"version": "0.15.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.18.0"
|
||||
current_version = "0.18.1-beta.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.18.0"
|
||||
version = "0.18.1-beta.0"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
||||
dynamic = ["version"]
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"pylance==0.22.0",
|
||||
"pylance==0.22.1b3",
|
||||
"tqdm>=4.27.0",
|
||||
"pydantic>=1.10",
|
||||
"packaging",
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections import defaultdict
|
||||
from numpy import nan
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -95,43 +96,22 @@ class LinearCombinationReranker(Reranker):
|
||||
pa.array([nan] * len(vector_results), type=pa.float32()),
|
||||
)
|
||||
return results
|
||||
|
||||
# sort both input tables on _rowid
|
||||
combined_list = []
|
||||
vector_list = vector_results.sort_by("_rowid").to_pylist()
|
||||
fts_list = fts_results.sort_by("_rowid").to_pylist()
|
||||
i, j = 0, 0
|
||||
while i < len(vector_list):
|
||||
if j >= len(fts_list):
|
||||
for vi in vector_list[i:]:
|
||||
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
|
||||
combined_list.append(vi)
|
||||
break
|
||||
|
||||
vi = vector_list[i]
|
||||
fj = fts_list[j]
|
||||
# invert the fts score from relevance to distance
|
||||
inverted_fts_score = self._invert_score(fj["_score"])
|
||||
if vi["_rowid"] == fj["_rowid"]:
|
||||
vi["_relevance_score"] = self._combine_score(
|
||||
vi["_distance"], inverted_fts_score
|
||||
)
|
||||
vi["_score"] = fj["_score"] # keep the original score
|
||||
combined_list.append(vi)
|
||||
i += 1
|
||||
j += 1
|
||||
elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
|
||||
vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
|
||||
combined_list.append(vi)
|
||||
i += 1
|
||||
results = defaultdict()
|
||||
for vector_result in vector_results.to_pylist():
|
||||
results[vector_result["_rowid"]] = vector_result
|
||||
for fts_result in fts_results.to_pylist():
|
||||
row_id = fts_result["_rowid"]
|
||||
if row_id in results:
|
||||
results[row_id]["_score"] = fts_result["_score"]
|
||||
else:
|
||||
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
|
||||
combined_list.append(fj)
|
||||
j += 1
|
||||
if j < len(fts_list) - 1:
|
||||
for fj in fts_list[j:]:
|
||||
fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
|
||||
combined_list.append(fj)
|
||||
results[row_id] = fts_result
|
||||
|
||||
combined_list = []
|
||||
for row_id, result in results.items():
|
||||
vector_score = self._invert_score(result.get("_distance", fill))
|
||||
fts_score = result.get("_score", fill)
|
||||
result["_relevance_score"] = self._combine_score(vector_score, fts_score)
|
||||
combined_list.append(result)
|
||||
|
||||
relevance_score_schema = pa.schema(
|
||||
[
|
||||
@@ -148,10 +128,10 @@ class LinearCombinationReranker(Reranker):
|
||||
tbl = self._keep_relevance_score(tbl)
|
||||
return tbl
|
||||
|
||||
def _combine_score(self, score1, score2):
|
||||
def _combine_score(self, vector_score, fts_score):
|
||||
# these scores represent distance
|
||||
return 1 - (self.weight * score1 + (1 - self.weight) * score2)
|
||||
return 1 - (self.weight * vector_score + (1 - self.weight) * fts_score)
|
||||
|
||||
def _invert_score(self, score: float):
|
||||
def _invert_score(self, dist: float):
|
||||
# Invert the score between relevance and distance
|
||||
return 1 - score
|
||||
return 1 - dist
|
||||
|
||||
@@ -3,6 +3,7 @@ import shutil
|
||||
# --8<-- [start:imports]
|
||||
import lancedb
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
# --8<-- [end:imports]
|
||||
|
||||
@@ -12,16 +13,32 @@ shutil.rmtree("data/binary_lancedb", ignore_errors=True)
|
||||
def test_binary_vector():
|
||||
# --8<-- [start:sync_binary_vector]
|
||||
db = lancedb.connect("data/binary_lancedb")
|
||||
data = [
|
||||
{
|
||||
"id": i,
|
||||
"vector": np.random.randint(0, 256, size=16),
|
||||
}
|
||||
for i in range(1024)
|
||||
]
|
||||
tbl = db.create_table("my_binary_vectors", data=data)
|
||||
query = np.random.randint(0, 256, size=16)
|
||||
tbl.search(query).metric("hamming").to_arrow()
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
# for dim=256, lance stores every 8 bits in a byte
|
||||
# so the vector field should be a list of 256 / 8 = 32 bytes
|
||||
pa.field("vector", pa.list_(pa.uint8(), 32)),
|
||||
]
|
||||
)
|
||||
tbl = db.create_table("my_binary_vectors", schema=schema)
|
||||
|
||||
data = []
|
||||
for i in range(1024):
|
||||
vector = np.random.randint(0, 2, size=256)
|
||||
# pack the binary vector into bytes to save space
|
||||
packed_vector = np.packbits(vector)
|
||||
data.append(
|
||||
{
|
||||
"id": i,
|
||||
"vector": packed_vector,
|
||||
}
|
||||
)
|
||||
tbl.add(data)
|
||||
|
||||
query = np.random.randint(0, 2, size=256)
|
||||
packed_query = np.packbits(query)
|
||||
tbl.search(packed_query).metric("hamming").to_arrow()
|
||||
# --8<-- [end:sync_binary_vector]
|
||||
db.drop_table("my_binary_vectors")
|
||||
|
||||
@@ -30,15 +47,31 @@ def test_binary_vector():
|
||||
async def test_binary_vector_async():
|
||||
# --8<-- [start:async_binary_vector]
|
||||
db = await lancedb.connect_async("data/binary_lancedb")
|
||||
data = [
|
||||
{
|
||||
"id": i,
|
||||
"vector": np.random.randint(0, 256, size=16),
|
||||
}
|
||||
for i in range(1024)
|
||||
]
|
||||
tbl = await db.create_table("my_binary_vectors", data=data)
|
||||
query = np.random.randint(0, 256, size=16)
|
||||
await tbl.query().nearest_to(query).distance_type("hamming").to_arrow()
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("id", pa.int64()),
|
||||
# for dim=256, lance stores every 8 bits in a byte
|
||||
# so the vector field should be a list of 256 / 8 = 32 bytes
|
||||
pa.field("vector", pa.list_(pa.uint8(), 32)),
|
||||
]
|
||||
)
|
||||
tbl = await db.create_table("my_binary_vectors", schema=schema)
|
||||
|
||||
data = []
|
||||
for i in range(1024):
|
||||
vector = np.random.randint(0, 2, size=256)
|
||||
# pack the binary vector into bytes to save space
|
||||
packed_vector = np.packbits(vector)
|
||||
data.append(
|
||||
{
|
||||
"id": i,
|
||||
"vector": packed_vector,
|
||||
}
|
||||
)
|
||||
await tbl.add(data)
|
||||
|
||||
query = np.random.randint(0, 2, size=256)
|
||||
packed_query = np.packbits(query)
|
||||
await tbl.query().nearest_to(packed_query).distance_type("hamming").to_arrow()
|
||||
# --8<-- [end:async_binary_vector]
|
||||
await db.drop_table("my_binary_vectors")
|
||||
|
||||
77
python/python/tests/docs/test_multivector.py
Normal file
77
python/python/tests/docs/test_multivector.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import shutil
|
||||
from lancedb.index import IvfPq
|
||||
import pytest
|
||||
|
||||
# --8<-- [start:imports]
|
||||
import lancedb
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
# --8<-- [end:imports]
|
||||
|
||||
shutil.rmtree("data/multivector_demo", ignore_errors=True)
|
||||
|
||||
|
||||
def test_multivector():
    """Docs example: store, index and search multi-vector rows with the sync API."""
    # --8<-- [start:sync_multivector]
    db = lancedb.connect("data/multivector_demo")

    # each row holds a list of 256-dimensional float32 vectors
    multivector_type = pa.list_(pa.list_(pa.float32(), 256))
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field("vector", multivector_type),
        ]
    )

    # 1024 rows, two random vectors per row
    data = []
    for row_id in range(1024):
        vectors = np.random.random(size=(2, 256))
        data.append({"id": row_id, "vector": vectors.tolist()})
    tbl = db.create_table("my_table", data=data, schema=schema)

    # only cosine similarity is supported for multi-vectors
    tbl.create_index(metric="cosine")

    # query with single vector
    query = np.random.random(256)
    tbl.search(query).to_arrow()

    # query with multiple vectors
    query = np.random.random(size=(2, 256))
    tbl.search(query).to_arrow()

    # --8<-- [end:sync_multivector]
    db.drop_table("my_table")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multivector_async():
    """Docs example: store, index and search multi-vector rows with the async API."""
    # NOTE(review): the final multi-vector query is only built, never executed
    # (the sync example runs it through tbl.search) -- confirm whether
    # nearest_to() accepts a 2-D query before adding the call.
    # --8<-- [start:async_multivector]
    db = await lancedb.connect_async("data/multivector_demo")

    # each row holds a list of 256-dimensional float32 vectors
    multivector_type = pa.list_(pa.list_(pa.float32(), 256))
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field("vector", multivector_type),
        ]
    )

    # 1024 rows, two random vectors per row
    data = []
    for row_id in range(1024):
        vectors = np.random.random(size=(2, 256))
        data.append({"id": row_id, "vector": vectors.tolist()})
    tbl = await db.create_table("my_table", data=data, schema=schema)

    # only cosine similarity is supported for multi-vectors
    cosine_ivf_pq = IvfPq(distance_type="cosine")
    await tbl.create_index(column="vector", config=cosine_ivf_pq)

    # query with single vector
    query = np.random.random(256)
    await tbl.query().nearest_to(query).to_arrow()

    # query with multiple vectors
    query = np.random.random(size=(2, 256))

    # --8<-- [end:async_multivector]
    await db.drop_table("my_table")
|
||||
@@ -3,6 +3,7 @@ import random
|
||||
|
||||
import lancedb
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.conftest import MockTextEmbeddingFunction # noqa
|
||||
from lancedb.embeddings import EmbeddingFunctionRegistry
|
||||
@@ -281,6 +282,31 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_linear_combination(tmp_path, use_tantivy):
    """Merge hand-built vector and FTS results, then run the shared hybrid checks."""
    reranker = LinearCombinationReranker()

    # row ids 1-4 appear in both tables; 0 is vector-only and 5 is FTS-only
    vector_columns = {
        "_rowid": [0, 1, 2, 3, 4],
        "_distance": [0.1, 0.2, 0.3, 0.4, 0.5],
        "_text": ["a", "b", "c", "d", "e"],
    }
    fts_columns = {
        "_rowid": [1, 2, 3, 4, 5],
        "_score": [0.1, 0.2, 0.3, 0.4, 0.5],
        "_text": ["b", "c", "d", "e", "f"],
    }
    vector_results = pa.Table.from_pydict(vector_columns)
    fts_results = pa.Table.from_pydict(fts_columns)

    merged = reranker.merge_results(vector_results, fts_results, 1.0)

    # the union of both id sets has six distinct rows
    assert len(merged) == 6
    merged_columns = merged.column_names
    for kept in ("_rowid", "_text", "_relevance_score"):
        assert kept in merged_columns
    for dropped in ("_distance", "_score"):
        assert dropped not in merged_columns

    _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
|
||||
use crate::query::Select;
|
||||
use crate::table::AddDataMode;
|
||||
use crate::utils::{supported_btree_data_type, supported_vector_data_type};
|
||||
use crate::{Error, Table};
|
||||
use crate::{DistanceType, Error, Table};
|
||||
use arrow_array::RecordBatchReader;
|
||||
use arrow_ipc::reader::FileReader;
|
||||
use arrow_schema::{DataType, SchemaRef};
|
||||
@@ -592,7 +592,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
message: format!("Column {} not found in schema", column),
|
||||
})?;
|
||||
if supported_vector_data_type(field.data_type()) {
|
||||
("IVF_PQ", None)
|
||||
("IVF_PQ", Some(DistanceType::L2))
|
||||
} else if supported_btree_data_type(field.data_type()) {
|
||||
("BTREE", None)
|
||||
} else {
|
||||
|
||||
@@ -73,7 +73,7 @@ use crate::query::{
|
||||
IntoQueryVector, Query, QueryExecutionOptions, Select, VectorQuery, DEFAULT_TOP_K,
|
||||
};
|
||||
use crate::utils::{
|
||||
default_vector_column, supported_bitmap_data_type, supported_btree_data_type,
|
||||
default_vector_column, infer_vector_dim, supported_bitmap_data_type, supported_btree_data_type,
|
||||
supported_fts_data_type, supported_label_list_data_type, supported_vector_data_type,
|
||||
PatchReadParam, PatchWriteParam,
|
||||
};
|
||||
@@ -1370,14 +1370,8 @@ impl NativeTable {
|
||||
let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
|
||||
n
|
||||
} else {
|
||||
match field.data_type() {
|
||||
arrow_schema::DataType::FixedSizeList(_, n) => {
|
||||
Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
|
||||
}
|
||||
_ => Err(Error::Schema {
|
||||
message: format!("Column '{}' is not a FixedSizeList", field.name()),
|
||||
}),
|
||||
}?
|
||||
let dim = infer_vector_dim(field.data_type())?;
|
||||
suggested_num_sub_vectors(dim as u32)
|
||||
};
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
|
||||
|
||||
@@ -188,6 +188,24 @@ pub fn supported_vector_data_type(dtype: &DataType) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: remove this after we expose the same function in Lance.
|
||||
pub fn infer_vector_dim(data_type: &DataType) -> Result<usize> {
|
||||
infer_vector_dim_impl(data_type, false)
|
||||
}
|
||||
|
||||
fn infer_vector_dim_impl(data_type: &DataType, in_list: bool) -> Result<usize> {
|
||||
match (data_type, in_list) {
|
||||
(DataType::FixedSizeList(_, dim), _) => Ok(*dim as usize),
|
||||
(DataType::List(inner), false) => infer_vector_dim_impl(inner.data_type(), true),
|
||||
_ => Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"data type is not a vector (FixedSizeList or List<FixedSizeList>), but {:?}",
|
||||
data_type
|
||||
),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Note: this is temporary until we get a proper datatype conversion in Lance.
|
||||
pub fn string_to_datatype(s: &str) -> Option<DataType> {
|
||||
let data_type = serde_json::Value::String(s.to_string());
|
||||
|
||||
Reference in New Issue
Block a user