Compare commits

..

1 Commits

Author SHA1 Message Date
Lance Release
69cf220bc4 Bump version: 0.22.1-beta.2 → 0.22.1-beta.3 2025-09-22 04:48:19 +00:00
30 changed files with 86 additions and 426 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.22.1"
current_version = "0.22.1-beta.3"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

106
Cargo.lock generated
View File

@@ -1153,7 +1153,7 @@ dependencies = [
"bitflags 2.9.4",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"itertools 0.11.0",
"lazy_static",
"lazycell",
"log",
@@ -2929,18 +2929,6 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
[[package]]
name = "fastbloom"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18c1ddb9231d8554c2d6bdf4cfaabf0c59251658c68b6c95cd52dd0c513a912a"
dependencies = [
"getrandom 0.3.3",
"libm",
"rand 0.9.2",
"siphasher",
]
[[package]]
name = "fastdivide"
version = "0.4.2"
@@ -3040,9 +3028,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe0a0b1d16ce6b863be8ab766004d89ebf0779fd6ce31b0ef3bbc7fedaaad373"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-array",
"rand 0.9.2",
@@ -4219,9 +4206,8 @@ dependencies = [
[[package]]
name = "lance"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42171f2af5d377e6bbcc8a8572144ee15b73a8f78ceb6160f1adeabf0d0f3e3c"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-arith",
@@ -4284,9 +4270,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25ef9499a1e581112f45fbf743fdc8e24830cda0bd13396b11c71aa6e6cba083"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4304,9 +4289,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1101fffd5b161bbdc6e932d6c0a7f94cb1752b0f8cd6d18ef9064052ab901a84"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrayref",
"paste",
@@ -4315,9 +4299,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527ee5e6472d058d8c66c702fbe318a3f60f971e652e60dcfc6349bdbc9b0733"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4352,9 +4335,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65a80f7f15f2d941ec7b8253625cbb8e12081ea27584dd1fbc657fb9fb377f7a"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-array",
@@ -4383,9 +4365,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0495c8afa18f246ac4b337c47d7827560283783963dd2177862d91161478fd79"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-array",
@@ -4402,9 +4383,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e80e9ae49d68b95d58e77d9177f68983dce4f0803ef42840e1631b38dd66adc"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4432,7 +4412,6 @@ dependencies = [
"prost-types",
"rand 0.9.2",
"snafu",
"strum",
"tokio",
"tracing",
"xxhash-rust",
@@ -4441,9 +4420,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1707f9f5097b36c82d3a8524bb41c762c80d5dfa5e32aa7bfc6a1c0847a1cce"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4477,9 +4455,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28ab52586a5a7f5371a5abf4862968231f8c0232ce0780bc456f1ec16e9370f9"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-array",
@@ -4500,7 +4477,6 @@ dependencies = [
"datafusion-sql",
"deepsize",
"dirs",
"fastbloom",
"fst",
"futures",
"half",
@@ -4515,7 +4491,6 @@ dependencies = [
"lance-io",
"lance-linalg",
"lance-table",
"libm",
"log",
"num-traits",
"object_store",
@@ -4532,15 +4507,13 @@ dependencies = [
"tempfile",
"tokio",
"tracing",
"twox-hash",
"uuid",
]
[[package]]
name = "lance-io"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d606f9f6a7f8ec2cacf28dfce7b2fc39e7db9f0ec77f907b8e47c756e3dd163b"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-arith",
@@ -4580,9 +4553,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9f1a94a5d966ff1eae817a835e3a57b34f73300f83a43bb28e7e2806695b8ba"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4605,9 +4577,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fac5c0ca6e5c285645465b95fb99fc464a1fd22a6d4b32ae0e0760f06b4b8a7f"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow",
"arrow-array",
@@ -4645,9 +4616,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384acc1dd13379a2ae24f3e3635d9c1f4fb4dc1534f7ffd2740c268f2eb73455"
version = "0.35.0"
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4658,7 +4628,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.22.1-beta.3"
version = "0.22.1-beta.2"
dependencies = [
"arrow",
"arrow-array",
@@ -4745,7 +4715,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.22.1-beta.3"
version = "0.22.1-beta.2"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4765,7 +4735,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.25.1-beta.3"
version = "0.25.1-beta.2"
dependencies = [
"arrow",
"async-trait",
@@ -7811,15 +7781,6 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.25.3"
@@ -8480,9 +8441,6 @@ name = "twox-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
dependencies = [
"rand 0.9.2",
]
[[package]]
name = "typenum"

View File

@@ -15,14 +15,14 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.37.0", default-features = false, "features" = ["dynamodb"] }
lance-io = { "version" = "=0.37.0", default-features = false }
lance-index = "=0.37.0"
lance-linalg = "=0.37.0"
lance-table = "=0.37.0"
lance-testing = "=0.37.0"
lance-datafusion = "=0.37.0"
lance-encoding = "=0.37.0"
lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-io = { "version" = "=0.35.0", default-features = false, "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-index = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-linalg = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-table = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-testing = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-datafusion = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
lance-encoding = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
# Note that this one does not include pyarrow
arrow = { version = "55.1", optional = false }
arrow-array = "55.1"

View File

@@ -1,5 +1,4 @@
import argparse
import re
import sys
import json
@@ -19,12 +18,8 @@ def run_command(command: str) -> str:
def get_latest_stable_version() -> str:
version_line = run_command("cargo info lance | grep '^version:'")
# Example output: "version: 0.35.0 (latest 0.37.0)"
match = re.search(r'\(latest ([0-9.]+)\)', version_line)
if match:
return match.group(1)
# Fallback: use the first version after 'version:'
return version_line.split("version:")[1].split()[0].strip()
version = version_line.split(" ")[1].strip()
return version
def get_latest_preview_version() -> str:

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-final.0</version>
<version>0.22.1-beta.3</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-final.0</version>
<version>0.22.1-beta.3</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.22.1-final.0</version>
<version>0.22.1-beta.3</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.22.1"
version = "0.22.1-beta.3"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -1008,64 +1008,5 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(result).toEqual(null);
});
});
describe("boolean null handling", function () {
it("should handle null values in nullable boolean fields", () => {
const { makeArrowTable } = require("../lancedb/arrow");
const schema = new Schema([new Field("test", new arrow.Bool(), true)]);
// Test with all null values
const data = [{ test: null }];
const table = makeArrowTable(data, { schema });
expect(table.numRows).toBe(1);
expect(table.schema.names).toEqual(["test"]);
expect(table.getChild("test")!.get(0)).toBeNull();
});
it("should handle mixed null and non-null boolean values", () => {
const { makeArrowTable } = require("../lancedb/arrow");
const schema = new Schema([new Field("test", new Bool(), true)]);
// Test with mixed values
const data = [{ test: true }, { test: null }, { test: false }];
const table = makeArrowTable(data, { schema });
expect(table.numRows).toBe(3);
expect(table.getChild("test")!.get(0)).toBe(true);
expect(table.getChild("test")!.get(1)).toBeNull();
expect(table.getChild("test")!.get(2)).toBe(false);
});
});
},
);
// Test for the undefined values bug fix
describe("undefined values handling", () => {
it("should handle mixed undefined and actual values", () => {
const schema = new Schema([
new Field("text", new Utf8(), true), // nullable
new Field("number", new Int32(), true), // nullable
new Field("bool", new Bool(), true), // nullable
]);
const data = [
{ text: undefined, number: 42, bool: true },
{ text: "hello", number: undefined, bool: false },
{ text: "world", number: 123, bool: undefined },
];
const table = makeArrowTable(data, { schema });
const result = table.toArray();
expect(result).toHaveLength(3);
expect(result[0].text).toBe(null);
expect(result[0].number).toBe(42);
expect(result[0].bool).toBe(true);
expect(result[1].text).toBe("hello");
expect(result[1].number).toBe(null);
expect(result[1].bool).toBe(false);
expect(result[2].text).toBe("world");
expect(result[2].number).toBe(123);
expect(result[2].bool).toBe(null);
});
});

View File

@@ -705,7 +705,7 @@ function transposeData(
}
return current;
});
return makeVector(values, field.type, undefined, field.nullable);
return makeVector(values, field.type);
}
}
@@ -752,30 +752,9 @@ function makeVector(
values: unknown[],
type?: DataType,
stringAsDictionary?: boolean,
nullable?: boolean,
// biome-ignore lint/suspicious/noExplicitAny: skip
): Vector<any> {
if (type !== undefined) {
// Convert undefined values to null for nullable fields
if (nullable) {
values = values.map((v) => (v === undefined ? null : v));
}
// workaround for: https://github.com/apache/arrow-js/issues/68
if (DataType.isBool(type)) {
const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
if (!hasNonNullValue) {
const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
const data = makeData({
type: type,
length: values.length,
nullCount: values.length,
nullBitmap,
});
return arrowMakeVector(data);
}
}
// No need for inference, let Arrow create it
if (type instanceof Int) {
if (DataType.isInt(type) && type.bitWidth === 64) {
@@ -900,12 +879,7 @@ async function applyEmbeddingsFromMetadata(
for (const field of schema.fields) {
if (!(field.name in columns)) {
const nullValues = new Array(table.numRows).fill(null);
columns[field.name] = makeVector(
nullValues,
field.type,
undefined,
field.nullable,
);
columns[field.name] = makeVector(nullValues, field.type);
}
}
@@ -969,12 +943,7 @@ async function applyEmbeddings<T>(
} else if (schema != null) {
const destField = schema.fields.find((f) => f.name === destColumn);
if (destField != null) {
newColumns[destColumn] = makeVector(
[],
destField.type,
undefined,
destField.nullable,
);
newColumns[destColumn] = makeVector([], destField.type);
} else {
throw new Error(
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.22.1",
"version": "0.22.1-beta.3",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.22.1-beta.3",
"version": "0.22.1-beta.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.22.1-beta.3",
"version": "0.22.1-beta.2",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.22.1",
"version": "0.22.1-beta.3",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.25.1"
current_version = "0.25.1-beta.3"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.25.1"
version = "0.25.1-beta.3"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -9,7 +9,6 @@ from .linear_combination import LinearCombinationReranker
from .openai import OpenaiReranker
from .jinaai import JinaReranker
from .rrf import RRFReranker
from .mrr import MRRReranker
from .answerdotai import AnswerdotaiRerankers
from .voyageai import VoyageAIReranker
@@ -24,5 +23,4 @@ __all__ = [
"RRFReranker",
"AnswerdotaiRerankers",
"VoyageAIReranker",
"MRRReranker",
]

View File

@@ -1,169 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from typing import Union, List, TYPE_CHECKING
import pyarrow as pa
import numpy as np
from collections import defaultdict
from .base import Reranker
if TYPE_CHECKING:
from ..table import LanceVectorQueryBuilder
class MRRReranker(Reranker):
"""
Reranks the results using Mean Reciprocal Rank (MRR) algorithm based
on the scores of vector and FTS search.
Algorithm reference - https://en.wikipedia.org/wiki/Mean_reciprocal_rank
MRR calculates the average of reciprocal ranks across different search results.
For each document, it computes the reciprocal of its rank in each system,
then takes the mean of these reciprocal ranks as the final score.
Parameters
----------
weight_vector : float, default 0.5
Weight for vector search results (0.0 to 1.0)
weight_fts : float, default 0.5
Weight for FTS search results (0.0 to 1.0)
Note: weight_vector + weight_fts should equal 1.0
return_score : str, default "relevance"
Options are "relevance" or "all"
The type of score to return. If "relevance", will return only the relevance
score. If "all", will return all scores from the vector and FTS search along
with the relevance score.
"""
def __init__(
self,
weight_vector: float = 0.5,
weight_fts: float = 0.5,
return_score="relevance",
):
if not (0.0 <= weight_vector <= 1.0):
raise ValueError("weight_vector must be between 0.0 and 1.0")
if not (0.0 <= weight_fts <= 1.0):
raise ValueError("weight_fts must be between 0.0 and 1.0")
if abs(weight_vector + weight_fts - 1.0) > 1e-6:
raise ValueError("weight_vector + weight_fts must equal 1.0")
super().__init__(return_score)
self.weight_vector = weight_vector
self.weight_fts = weight_fts
def rerank_hybrid(
self,
query: str, # noqa: F821
vector_results: pa.Table,
fts_results: pa.Table,
):
vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []
# Maps result_id to list of (type, reciprocal_rank)
mrr_score_map = defaultdict(list)
if vector_ids:
for rank, result_id in enumerate(vector_ids, 1):
reciprocal_rank = 1.0 / rank
mrr_score_map[result_id].append(("vector", reciprocal_rank))
if fts_ids:
for rank, result_id in enumerate(fts_ids, 1):
reciprocal_rank = 1.0 / rank
mrr_score_map[result_id].append(("fts", reciprocal_rank))
final_mrr_scores = {}
for result_id, scores in mrr_score_map.items():
vector_rr = 0.0
fts_rr = 0.0
for score_type, reciprocal_rank in scores:
if score_type == "vector":
vector_rr = reciprocal_rank
elif score_type == "fts":
fts_rr = reciprocal_rank
# If a document doesn't appear, its reciprocal rank is 0
weighted_mrr = self.weight_vector * vector_rr + self.weight_fts * fts_rr
final_mrr_scores[result_id] = weighted_mrr
combined_results = self.merge_results(vector_results, fts_results)
combined_row_ids = combined_results["_rowid"].to_pylist()
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
combined_results = combined_results.append_column(
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
)
combined_results = combined_results.sort_by(
[("_relevance_score", "descending")]
)
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
return combined_results
def rerank_multivector(
self,
vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
query: str = None,
deduplicate: bool = True, # noqa: F821
):
"""
Reranks the results from multiple vector searches using MRR algorithm.
Each vector search result is treated as a separate ranking system,
and MRR calculates the mean of reciprocal ranks across all systems.
This cannot reuse rerank_hybrid because MRR semantics require treating
each vector result as a separate ranking system.
"""
if not all(isinstance(v, type(vector_results[0])) for v in vector_results):
raise ValueError(
"All elements in vector_results should be of the same type"
)
# avoid circular import
if type(vector_results[0]).__name__ == "LanceVectorQueryBuilder":
vector_results = [result.to_arrow() for result in vector_results]
elif not isinstance(vector_results[0], pa.Table):
raise ValueError(
"vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
)
if not all("_rowid" in result.column_names for result in vector_results):
raise ValueError(
"'_rowid' is required for deduplication. \
add _rowid to search results like this: \
`search().with_row_id(True)`"
)
mrr_score_map = defaultdict(list)
for result_table in vector_results:
result_ids = result_table["_rowid"].to_pylist()
for rank, result_id in enumerate(result_ids, 1):
reciprocal_rank = 1.0 / rank
mrr_score_map[result_id].append(reciprocal_rank)
final_mrr_scores = {}
for result_id, reciprocal_ranks in mrr_score_map.items():
mean_rr = np.mean(reciprocal_ranks)
final_mrr_scores[result_id] = mean_rr
combined = pa.concat_tables(vector_results, **self._concat_tables_args)
combined = self._deduplicate(combined)
combined_row_ids = combined["_rowid"].to_pylist()
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
combined = combined.append_column(
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
)
combined = combined.sort_by([("_relevance_score", "descending")])
if self.score == "relevance":
combined = self._keep_relevance_score(combined)
return combined

View File

@@ -1470,7 +1470,10 @@ class Table(ABC):
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
This parameter is no longer used and is deprecated.
If True, retrain the vector indices, this would refine the IVF clustering
and quantization, which may improve the search accuracy. It's faster than
re-creating the index from scratch, so it's recommended to try this first,
when the data distribution has changed significantly.
Experimental API
----------------
@@ -2832,7 +2835,10 @@ class LanceTable(Table):
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
This parameter is no longer used and is deprecated.
If True, retrain the vector indices, this would refine the IVF clustering
and quantization, which may improve the search accuracy. It's faster than
re-creating the index from scratch, so it's recommended to try this first,
when the data distribution has changed significantly.
Experimental API
----------------
@@ -4292,7 +4298,10 @@ class AsyncTable:
be deleted unless they are at least 7 days old. If delete_unverified is True
then these files will be deleted regardless of their age.
retrain: bool, default False
This parameter is no longer used and is deprecated.
If True, retrain the vector indices, this would refine the IVF clustering
and quantization, which may improve the search accuracy. It's faster than
re-creating the index from scratch, so it's recommended to try this first,
when the data distribution has changed significantly.
Experimental API
----------------
@@ -4315,19 +4324,10 @@ class AsyncTable:
cleanup_since_ms: Optional[int] = None
if cleanup_older_than is not None:
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
if retrain:
import warnings
warnings.warn(
"The 'retrain' parameter is deprecated and will be removed in a "
"future version.",
DeprecationWarning,
)
return await self._inner.optimize(
cleanup_since_ms=cleanup_since_ms,
delete_unverified=delete_unverified,
retrain=retrain,
)
async def list_indices(self) -> Iterable[IndexConfig]:

View File

@@ -22,7 +22,6 @@ from lancedb.rerankers import (
JinaReranker,
AnswerdotaiRerankers,
VoyageAIReranker,
MRRReranker,
)
from lancedb.table import LanceTable
@@ -47,7 +46,6 @@ def get_test_table(tmp_path, use_tantivy):
db,
"my_table",
schema=MyTable,
mode="overwrite",
)
# Need to test with a bunch of phrases to make sure sorting is consistent
@@ -98,7 +96,7 @@ def get_test_table(tmp_path, use_tantivy):
)
# Create a fts index
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
table.create_fts_index("text", use_tantivy=use_tantivy)
return table, MyTable
@@ -322,34 +320,6 @@ def test_rrf_reranker(tmp_path, use_tantivy):
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_mrr_reranker(tmp_path, use_tantivy):
reranker = MRRReranker()
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
# Test multi-vector part
table, schema = get_test_table(tmp_path, use_tantivy)
query = "single player experience"
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
rs2 = (
table.search(query, vector_column_name="meta_vector")
.limit(10)
.with_row_id(True)
)
result = reranker.rerank_multivector([rs1, rs2])
assert "_relevance_score" in result.column_names
assert len(result) <= 20
if len(result) > 1:
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
"The _relevance_score should be descending."
)
# Test with duplicate results
result_deduped = reranker.rerank_multivector([rs1, rs2, rs1])
assert len(result_deduped) == len(result)
def test_rrf_reranker_distance():
data = pa.table(
{

View File

@@ -591,11 +591,12 @@ impl Table {
}
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
pub fn optimize(
self_: PyRef<'_, Self>,
cleanup_since_ms: Option<u64>,
delete_unverified: Option<bool>,
retrain: Option<bool>,
) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
let older_than = if let Some(ms) = cleanup_since_ms {
@@ -631,9 +632,10 @@ impl Table {
.prune
.unwrap();
inner
.optimize(lancedb::table::OptimizeAction::Index(
OptimizeOptions::default(),
))
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
Some(true) => OptimizeOptions::retrain(),
_ => OptimizeOptions::default(),
}))
.await
.infer_error()?;
Ok(OptimizeStats {

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.22.1"
version = "0.22.1-beta.3"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -8,7 +8,7 @@
//! values
use std::cmp::max;
use lance::table::format::{IndexMetadata, Manifest};
use lance::table::format::{Index, Manifest};
use crate::DistanceType;
@@ -19,7 +19,7 @@ pub struct VectorIndex {
}
impl VectorIndex {
pub fn new_from_format(manifest: &Manifest, index: &IndexMetadata) -> Self {
pub fn new_from_format(manifest: &Manifest, index: &Index) -> Self {
let fields = index
.fields
.iter()

View File

@@ -1976,8 +1976,6 @@ impl NativeTable {
/// Delete keys from the config
pub async fn delete_config_keys(&self, delete_keys: &[&str]) -> Result<()> {
let mut dataset = self.dataset.get_mut().await?;
// TODO: update this when we implement metadata APIs
#[allow(deprecated)]
dataset.delete_config_keys(delete_keys).await?;
Ok(())
}
@@ -1988,8 +1986,6 @@ impl NativeTable {
upsert_values: impl IntoIterator<Item = (String, String)>,
) -> Result<()> {
let mut dataset = self.dataset.get_mut().await?;
// TODO: update this when we implement metadata APIs
#[allow(deprecated)]
dataset.replace_schema_metadata(upsert_values).await?;
Ok(())
}