Bump version: 0.18.0 → 0.18.1-beta.0

fix: linear reranker applies wrong score to combine (#2035 )
related to #2014 this fixes: - linear reranker may lose some results if the merging consumes all vector results earlier than fts results - linear reranker inverts the fts score but only vector distance can be inverted --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-12-23 05:19:58 +00:00 · 2025-01-17 04:37:23 +00:00 · 2025-01-17 11:33:48 +08:00 · 2025-01-16 12:42:42 -05:00 · 2025-01-16 14:08:04 +08:00 · 2025-01-15 07:37:37 -08:00
13 changed files with 248 additions and 118 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,14 +21,16 @@ categories = ["database-implementations"]
 rust-version = "1.78.0"

 [workspace.dependencies]
-lance = { "version" = "=0.22.0", "features" = ["dynamodb"] }
-lance-io = "=0.22.0"
-lance-index = "=0.22.0"
-lance-linalg = "=0.22.0"
-lance-table = "=0.22.0"
-lance-testing = "=0.22.0"
-lance-datafusion = "=0.22.0"
-lance-encoding = "=0.22.0"
+lance = { "version" = "=0.22.1", "features" = [
+    "dynamodb",
+], git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-io = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-index = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-linalg = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-table = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-testing = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-datafusion = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
+lance-encoding = { version = "=0.22.1", git = "https://github.com/lancedb/lance.git", tag = "v0.22.1-beta.3" }
 # Note that this one does not include pyarrow
 arrow = { version = "53.2", optional = false }
 arrow-array = "53.2"
--- a/node/package-lock.json
+++ b/node/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "vectordb",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "vectordb",
-      "version": "0.15.0-beta.0",
+      "version": "0.15.0",
      "cpu": [
        "x64",
        "arm64"
@@ -52,14 +52,14 @@
        "uuid": "^9.0.0"
      },
      "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.15.0-beta.0",
-        "@lancedb/vectordb-darwin-x64": "0.15.0-beta.0",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.15.0-beta.0",
-        "@lancedb/vectordb-linux-arm64-musl": "0.15.0-beta.0",
-        "@lancedb/vectordb-linux-x64-gnu": "0.15.0-beta.0",
-        "@lancedb/vectordb-linux-x64-musl": "0.15.0-beta.0",
-        "@lancedb/vectordb-win32-arm64-msvc": "0.15.0-beta.0",
-        "@lancedb/vectordb-win32-x64-msvc": "0.15.0-beta.0"
+        "@lancedb/vectordb-darwin-arm64": "0.15.0",
+        "@lancedb/vectordb-darwin-x64": "0.15.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.15.0",
+        "@lancedb/vectordb-linux-arm64-musl": "0.15.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.15.0",
+        "@lancedb/vectordb-linux-x64-musl": "0.15.0",
+        "@lancedb/vectordb-win32-arm64-msvc": "0.15.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.15.0"
      },
      "peerDependencies": {
        "@apache-arrow/ts": "^14.0.2",
@@ -330,9 +330,9 @@
      }
    },
    "node_modules/@lancedb/vectordb-darwin-arm64": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0-beta.0.tgz",
-      "integrity": "sha512-4sPAW4p1YFVfURyf0k017l6LRCz+VmN9fVUBy7W27b6EOQ3xuIb3t5xq3JAtslMPWBP3wxP8rKXXDmlbqDg3+g==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.15.0.tgz",
+      "integrity": "sha512-FnBRsCrxvecjhkMQus9M9RQpXyhu1jxQjYGDaqqRIfcUd3ew7ahIR4qk9FyALHmjpPd72xJZgNLjliHtsIX4/w==",
      "cpu": [
        "arm64"
      ],
@@ -343,9 +343,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-darwin-x64": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0-beta.0.tgz",
-      "integrity": "sha512-uzGINrBBsZattB4/ZYxdGNkTxNh3MqE6Y4nF762qo0zWWSiu+QNHQ+ZyLAZ2lwrEvwxs8LUaJNmnpn3nocHc1A==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.15.0.tgz",
+      "integrity": "sha512-zy+nt1WBCabVI16u2t3sqGUXBOmnF5ZXMsHa9TWYEXVnbw5112K7/1783DTNA/ZBI/WziUa5jqYQ0GOwkgruqA==",
      "cpu": [
        "x64"
      ],
@@ -356,9 +356,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-linux-arm64-gnu": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0-beta.0.tgz",
-      "integrity": "sha512-bgphfea8h65vJ+bAL+vb+XEfmjskLZ+trZ3GN4n6SICU7XMGSFPl9xzPLGAj1WsoFCTJHe87DRYQpsWGlOI/LQ==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.15.0.tgz",
+      "integrity": "sha512-2Pbw+z5Ij5QBvmBxmjaT5F2lNHftVWlarDM1bDc4JtgodJ3Js729qnVLQ0yehnlt+hM6aGFEyn8bH5vf6gEvpQ==",
      "cpu": [
        "arm64"
      ],
@@ -369,9 +369,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-linux-arm64-musl": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0-beta.0.tgz",
-      "integrity": "sha512-GpmVgqMS9ztNX53z8v0JdZiG6K1cK+mJnGZd3Gzguiavrly4mkYZ8IKNwWP9RmewUMNsFWR0IzD4VR+ojVpjlQ==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-musl/-/vectordb-linux-arm64-musl-0.15.0.tgz",
+      "integrity": "sha512-WIvgd2EY2maCdYNHPC0C9RprjNWL83FkQKtn591xixltFk3XKgvBQ2USZW2tXndH/WVdvFQvystmZ3dgUrh8DQ==",
      "cpu": [
        "arm64"
      ],
@@ -382,9 +382,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-linux-x64-gnu": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0-beta.0.tgz",
-      "integrity": "sha512-6Y/39TDv4UDVWnl8UpUJ8mqv9rUNc9Q5VR510I7w34c0ChdWvjqdcy+JFnGrraamE1DA8E6wGEz+5oG0zprkNg==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.15.0.tgz",
+      "integrity": "sha512-Pet3aPE+yQT13Gm0+fh11pgHvImS4X8Uf0zRdzsx0eja7x8j15VrVcZTEVTT4QdBNiZrhXBuiq482NJBsqe6vw==",
      "cpu": [
        "x64"
      ],
@@ -395,9 +395,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-linux-x64-musl": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0-beta.0.tgz",
-      "integrity": "sha512-GRdW2dhf6DmynhRojjtQjs8DeARM1WpbZZKXukeofOSMv6JoRBSWKw2DzW5sF/285IMU81B0OXZE75QjLp+VJg==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-musl/-/vectordb-linux-x64-musl-0.15.0.tgz",
+      "integrity": "sha512-BC1RvIoEmyOr7ENp618vs9F05gdN7aKlToJNZnGIoi++hRZ25y39B1xxMXQHDnUL8G+Ur9kJObfQ43nVWqueTQ==",
      "cpu": [
        "x64"
      ],
@@ -408,9 +408,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-win32-arm64-msvc": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0-beta.0.tgz",
-      "integrity": "sha512-2EmRHuqqj8kC5ArUZztUWWTfNd774zL68btOlyhYL1CAiet5jIeGuFWJifdh+PXfQeLoa4GLW5LwyudIR4IHwA==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-arm64-msvc/-/vectordb-win32-arm64-msvc-0.15.0.tgz",
+      "integrity": "sha512-H9BeryZl1aLxldtVP0XyiQJyzKStkuxS6SmIg+zaANr9Dns+LmVxYCz429JLC0DlvBWoYjTfK9WJTgMSZXr0Cg==",
      "cpu": [
        "arm64"
      ],
@@ -421,9 +421,9 @@
      ]
    },
    "node_modules/@lancedb/vectordb-win32-x64-msvc": {
-      "version": "0.15.0-beta.0",
-      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0-beta.0.tgz",
-      "integrity": "sha512-lWq9b7LnWMGO0zDsp3rsLYyAzLooV7zQP77ph9Qv9fF0e4egD5l6SmMsAdQqLQnlhbQjkRjt3XRoDsqI809fcw==",
+      "version": "0.15.0",
+      "resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.15.0.tgz",
+      "integrity": "sha512-J8JICux2M82OR27i/4YAbEPlvszuE7EnGIU5jmm2+RTFaptKOCshH1C4D4jEXDAaHcUkVgsxyc9lGmGJCkGLhg==",
      "cpu": [
        "x64"
      ],
--- a/nodejs/package-lock.json
+++ b/nodejs/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "@lancedb/lancedb",
-  "version": "0.15.0-beta.0",
+  "version": "0.15.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "@lancedb/lancedb",
-      "version": "0.15.0-beta.0",
+      "version": "0.15.0",
      "cpu": [
        "x64",
        "arm64"
--- a/python/.bumpversion.toml
+++ b/python/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.18.0"
+current_version = "0.18.1-beta.0"
 parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.18.0"
+version = "0.18.1-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -4,7 +4,7 @@ name = "lancedb"
 dynamic = ["version"]
 dependencies = [
    "deprecation",
-    "pylance==0.22.0",
+    "pylance==0.22.1b3",
    "tqdm>=4.27.0",
    "pydantic>=1.10",
    "packaging",
--- a/python/python/lancedb/rerankers/linear_combination.py
+++ b/python/python/lancedb/rerankers/linear_combination.py
@@ -11,6 +11,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.

+from collections import defaultdict
 from numpy import nan
 import pyarrow as pa

@@ -95,43 +96,22 @@ class LinearCombinationReranker(Reranker):
                    pa.array([nan] * len(vector_results), type=pa.float32()),
                )
            return results
-
-        # sort both input tables on _rowid
-        combined_list = []
-        vector_list = vector_results.sort_by("_rowid").to_pylist()
-        fts_list = fts_results.sort_by("_rowid").to_pylist()
-        i, j = 0, 0
-        while i < len(vector_list):
-            if j >= len(fts_list):
-                for vi in vector_list[i:]:
-                    vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
-                    combined_list.append(vi)
-                break
-
-            vi = vector_list[i]
-            fj = fts_list[j]
-            # invert the fts score from relevance to distance
-            inverted_fts_score = self._invert_score(fj["_score"])
-            if vi["_rowid"] == fj["_rowid"]:
-                vi["_relevance_score"] = self._combine_score(
-                    vi["_distance"], inverted_fts_score
-                )
-                vi["_score"] = fj["_score"]  # keep the original score
-                combined_list.append(vi)
-                i += 1
-                j += 1
-            elif vector_list[i]["_rowid"] < fts_list[j]["_rowid"]:
-                vi["_relevance_score"] = self._combine_score(vi["_distance"], fill)
-                combined_list.append(vi)
-                i += 1
+        results = defaultdict()
+        for vector_result in vector_results.to_pylist():
+            results[vector_result["_rowid"]] = vector_result
+        for fts_result in fts_results.to_pylist():
+            row_id = fts_result["_rowid"]
+            if row_id in results:
+                results[row_id]["_score"] = fts_result["_score"]
            else:
-                fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
-                combined_list.append(fj)
-                j += 1
-        if j < len(fts_list) - 1:
-            for fj in fts_list[j:]:
-                fj["_relevance_score"] = self._combine_score(inverted_fts_score, fill)
-                combined_list.append(fj)
+                results[row_id] = fts_result
+
+        combined_list = []
+        for row_id, result in results.items():
+            vector_score = self._invert_score(result.get("_distance", fill))
+            fts_score = result.get("_score", fill)
+            result["_relevance_score"] = self._combine_score(vector_score, fts_score)
+            combined_list.append(result)

        relevance_score_schema = pa.schema(
            [
@@ -148,10 +128,10 @@ class LinearCombinationReranker(Reranker):
            tbl = self._keep_relevance_score(tbl)
        return tbl

-    def _combine_score(self, score1, score2):
+    def _combine_score(self, vector_score, fts_score):
        # these scores represent distance
-        return 1 - (self.weight * score1 + (1 - self.weight) * score2)
+        return 1 - (self.weight * vector_score + (1 - self.weight) * fts_score)

-    def _invert_score(self, score: float):
+    def _invert_score(self, dist: float):
        # Invert the score between relevance and distance
-        return 1 - score
+        return 1 - dist
--- a/python/python/tests/docs/test_binary_vector.py
+++ b/python/python/tests/docs/test_binary_vector.py
@@ -3,6 +3,7 @@ import shutil
 # --8<-- [start:imports]
 import lancedb
 import numpy as np
+import pyarrow as pa
 import pytest
 # --8<-- [end:imports]

@@ -12,16 +13,32 @@ shutil.rmtree("data/binary_lancedb", ignore_errors=True)
 def test_binary_vector():
    # --8<-- [start:sync_binary_vector]
    db = lancedb.connect("data/binary_lancedb")
-    data = [
-        {
-            "id": i,
-            "vector": np.random.randint(0, 256, size=16),
-        }
-        for i in range(1024)
-    ]
-    tbl = db.create_table("my_binary_vectors", data=data)
-    query = np.random.randint(0, 256, size=16)
-    tbl.search(query).metric("hamming").to_arrow()
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            # for dim=256, lance stores every 8 bits in a byte
+            # so the vector field should be a list of 256 / 8 = 32 bytes
+            pa.field("vector", pa.list_(pa.uint8(), 32)),
+        ]
+    )
+    tbl = db.create_table("my_binary_vectors", schema=schema)
+
+    data = []
+    for i in range(1024):
+        vector = np.random.randint(0, 2, size=256)
+        # pack the binary vector into bytes to save space
+        packed_vector = np.packbits(vector)
+        data.append(
+            {
+                "id": i,
+                "vector": packed_vector,
+            }
+        )
+    tbl.add(data)
+
+    query = np.random.randint(0, 2, size=256)
+    packed_query = np.packbits(query)
+    tbl.search(packed_query).metric("hamming").to_arrow()
    # --8<-- [end:sync_binary_vector]
    db.drop_table("my_binary_vectors")

@@ -30,15 +47,31 @@ def test_binary_vector():
 async def test_binary_vector_async():
    # --8<-- [start:async_binary_vector]
    db = await lancedb.connect_async("data/binary_lancedb")
-    data = [
-        {
-            "id": i,
-            "vector": np.random.randint(0, 256, size=16),
-        }
-        for i in range(1024)
-    ]
-    tbl = await db.create_table("my_binary_vectors", data=data)
-    query = np.random.randint(0, 256, size=16)
-    await tbl.query().nearest_to(query).distance_type("hamming").to_arrow()
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            # for dim=256, lance stores every 8 bits in a byte
+            # so the vector field should be a list of 256 / 8 = 32 bytes
+            pa.field("vector", pa.list_(pa.uint8(), 32)),
+        ]
+    )
+    tbl = await db.create_table("my_binary_vectors", schema=schema)
+
+    data = []
+    for i in range(1024):
+        vector = np.random.randint(0, 2, size=256)
+        # pack the binary vector into bytes to save space
+        packed_vector = np.packbits(vector)
+        data.append(
+            {
+                "id": i,
+                "vector": packed_vector,
+            }
+        )
+    await tbl.add(data)
+
+    query = np.random.randint(0, 2, size=256)
+    packed_query = np.packbits(query)
+    await tbl.query().nearest_to(packed_query).distance_type("hamming").to_arrow()
    # --8<-- [end:async_binary_vector]
    await db.drop_table("my_binary_vectors")
--- a/python/python/tests/docs/test_multivector.py
+++ b/python/python/tests/docs/test_multivector.py
@@ -0,0 +1,77 @@
+import shutil
+from lancedb.index import IvfPq
+import pytest
+
+# --8<-- [start:imports]
+import lancedb
+import numpy as np
+import pyarrow as pa
+# --8<-- [end:imports]
+
+shutil.rmtree("data/multivector_demo", ignore_errors=True)
+
+
+def test_multivector():
+    # --8<-- [start:sync_multivector]
+    db = lancedb.connect("data/multivector_demo")
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
+        ]
+    )
+    data = [
+        {
+            "id": i,
+            "vector": np.random.random(size=(2, 256)).tolist(),
+        }
+        for i in range(1024)
+    ]
+    tbl = db.create_table("my_table", data=data, schema=schema)
+
+    # only cosine similarity is supported for multi-vectors
+    tbl.create_index(metric="cosine")
+
+    # query with single vector
+    query = np.random.random(256)
+    tbl.search(query).to_arrow()
+
+    # query with multiple vectors
+    query = np.random.random(size=(2, 256))
+    tbl.search(query).to_arrow()
+
+    # --8<-- [end:sync_multivector]
+    db.drop_table("my_table")
+
+
+@pytest.mark.asyncio
+async def test_multivector_async():
+    # --8<-- [start:async_multivector]
+    db = await lancedb.connect_async("data/multivector_demo")
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field("vector", pa.list_(pa.list_(pa.float32(), 256))),
+        ]
+    )
+    data = [
+        {
+            "id": i,
+            "vector": np.random.random(size=(2, 256)).tolist(),
+        }
+        for i in range(1024)
+    ]
+    tbl = await db.create_table("my_table", data=data, schema=schema)
+
+    # only cosine similarity is supported for multi-vectors
+    await tbl.create_index(column="vector", config=IvfPq(distance_type="cosine"))
+
+    # query with single vector
+    query = np.random.random(256)
+    await tbl.query().nearest_to(query).to_arrow()
+
+    # query with multiple vectors
+    query = np.random.random(size=(2, 256))
+
+    # --8<-- [end:async_multivector]
+    await db.drop_table("my_table")
--- a/python/python/tests/test_rerankers.py
+++ b/python/python/tests/test_rerankers.py
@@ -3,6 +3,7 @@ import random

 import lancedb
 import numpy as np
+import pyarrow as pa
 import pytest
 from lancedb.conftest import MockTextEmbeddingFunction  # noqa
 from lancedb.embeddings import EmbeddingFunctionRegistry
@@ -281,6 +282,31 @@ def _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy):
@pytest.mark.parametrize("use_tantivy", [True, False])
 def test_linear_combination(tmp_path, use_tantivy):
    reranker = LinearCombinationReranker()
+
+    vector_results = pa.Table.from_pydict(
+        {
+            "_rowid": [0, 1, 2, 3, 4],
+            "_distance": [0.1, 0.2, 0.3, 0.4, 0.5],
+            "_text": ["a", "b", "c", "d", "e"],
+        }
+    )
+
+    fts_results = pa.Table.from_pydict(
+        {
+            "_rowid": [1, 2, 3, 4, 5],
+            "_score": [0.1, 0.2, 0.3, 0.4, 0.5],
+            "_text": ["b", "c", "d", "e", "f"],
+        }
+    )
+
+    combined_results = reranker.merge_results(vector_results, fts_results, 1.0)
+    assert len(combined_results) == 6
+    assert "_rowid" in combined_results.column_names
+    assert "_text" in combined_results.column_names
+    assert "_distance" not in combined_results.column_names
+    assert "_score" not in combined_results.column_names
+    assert "_relevance_score" in combined_results.column_names
+
    _run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)


--- a/rust/lancedb/src/remote/table.rs
+++ b/rust/lancedb/src/remote/table.rs
@@ -6,7 +6,7 @@ use crate::index::IndexStatistics;
 use crate::query::Select;
 use crate::table::AddDataMode;
 use crate::utils::{supported_btree_data_type, supported_vector_data_type};
-use crate::{Error, Table};
+use crate::{DistanceType, Error, Table};
 use arrow_array::RecordBatchReader;
 use arrow_ipc::reader::FileReader;
 use arrow_schema::{DataType, SchemaRef};
@@ -592,7 +592,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
                        message: format!("Column {} not found in schema", column),
                    })?;
                if supported_vector_data_type(field.data_type()) {
-                    ("IVF_PQ", None)
+                    ("IVF_PQ", Some(DistanceType::L2))
                } else if supported_btree_data_type(field.data_type()) {
                    ("BTREE", None)
                } else {
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -73,7 +73,7 @@ use crate::query::{
    IntoQueryVector, Query, QueryExecutionOptions, Select, VectorQuery, DEFAULT_TOP_K,
 };
 use crate::utils::{
-    default_vector_column, supported_bitmap_data_type, supported_btree_data_type,
+    default_vector_column, infer_vector_dim, supported_bitmap_data_type, supported_btree_data_type,
    supported_fts_data_type, supported_label_list_data_type, supported_vector_data_type,
    PatchReadParam, PatchWriteParam,
 };
@@ -1370,14 +1370,8 @@ impl NativeTable {
        let num_sub_vectors: u32 = if let Some(n) = index.num_sub_vectors {
            n
        } else {
-            match field.data_type() {
-                arrow_schema::DataType::FixedSizeList(_, n) => {
-                    Ok::<u32, Error>(suggested_num_sub_vectors(*n as u32))
-                }
-                _ => Err(Error::Schema {
-                    message: format!("Column '{}' is not a FixedSizeList", field.name()),
-                }),
-            }?
+            let dim = infer_vector_dim(field.data_type())?;
+            suggested_num_sub_vectors(dim as u32)
        };
        let mut dataset = self.dataset.get_mut().await?;
        let lance_idx_params = lance::index::vector::VectorIndexParams::ivf_pq(
--- a/rust/lancedb/src/utils.rs
+++ b/rust/lancedb/src/utils.rs
@@ -188,6 +188,24 @@ pub fn supported_vector_data_type(dtype: &DataType) -> bool {
    }
 }

+// TODO: remove this after we expose the same function in Lance.
+pub fn infer_vector_dim(data_type: &DataType) -> Result<usize> {
+    infer_vector_dim_impl(data_type, false)
+}
+
+fn infer_vector_dim_impl(data_type: &DataType, in_list: bool) -> Result<usize> {
+    match (data_type, in_list) {
+        (DataType::FixedSizeList(_, dim), _) => Ok(*dim as usize),
+        (DataType::List(inner), false) => infer_vector_dim_impl(inner.data_type(), true),
+        _ => Err(Error::InvalidInput {
+            message: format!(
+                "data type is not a vector (FixedSizeList or List<FixedSizeList>), but {:?}",
+                data_type
+            ),
+        }),
+    }
+}
+
 /// Note: this is temporary until we get a proper datatype conversion in Lance.
 pub fn string_to_datatype(s: &str) -> Option<DataType> {
    let data_type = serde_json::Value::String(s.to_string());
Author	SHA1	Message	Date
Lance Release	3dc1803c07	Bump version: 0.18.0 → 0.18.1-beta.0	2025-01-17 04:37:23 +00:00
BubbleCal	d0501f65f1	fix: linear reranker applies wrong score to combine (#2035 ) related to #2014 this fixes: - linear reranker may lose some results if the merging consumes all vector results earlier than fts results - linear reranker inverts the fts score but only vector distance can be inverted --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>	2025-01-17 11:33:48 +08:00
Bert	4703cc6894	chore: upgrade lance to v0.22.1-beta.3 (#2038 )	2025-01-16 12:42:42 -05:00
BubbleCal	493f9ce467	fix: can't infer the vector column for multivector (#2026 ) Signed-off-by: BubbleCal <bubble-cal@outlook.com>	2025-01-16 14:08:04 +08:00
Weston Pace	5c759505b8	feat: upgrade lance 0.22.1b1 (#2029 ) Now the version actually exists :)	2025-01-15 07:37:37 -08:00
BubbleCal	bb6a39727e	fix: missing distance type for auto index on RemoteTable (#2027 ) Signed-off-by: BubbleCal <bubble-cal@outlook.com>	2025-01-15 20:28:55 +08:00
BubbleCal	d57bed90e5	docs: add missing example code (#2025 )	2025-01-14 21:17:05 -08:00
BubbleCal	648327e90c	docs: show how to pack bits for binary vector (#2020 ) Signed-off-by: BubbleCal <bubble-cal@outlook.com>	2025-01-14 09:00:57 -08:00
Lance Release	6c7e81ee57	Updating package-lock.json	2025-01-14 02:14:37 +00:00
Lance Release	905e9d4738	Updating package-lock.json	2025-01-14 01:03:49 +00:00
Lance Release	38642e349c	Updating package-lock.json	2025-01-14 01:03:33 +00:00