mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 13:59:58 +00:00
Compare commits
7 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
222e3264ab | ||
|
|
13505026cb | ||
|
|
b0800b4b71 | ||
|
|
1befebf614 | ||
|
|
1ab60fae7f | ||
|
|
e921c90c1b | ||
|
|
05a4ea646a |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.1-beta.2"
|
||||
current_version = "0.22.1-beta.3"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
106
Cargo.lock
generated
106
Cargo.lock
generated
@@ -1153,7 +1153,7 @@ dependencies = [
|
||||
"bitflags 2.9.4",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools 0.11.0",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"log",
|
||||
@@ -2929,6 +2929,18 @@ version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55"
|
||||
|
||||
[[package]]
|
||||
name = "fastbloom"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18c1ddb9231d8554c2d6bdf4cfaabf0c59251658c68b6c95cd52dd0c513a912a"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"libm",
|
||||
"rand 0.9.2",
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastdivide"
|
||||
version = "0.4.2"
|
||||
@@ -3028,8 +3040,9 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe0a0b1d16ce6b863be8ab766004d89ebf0779fd6ce31b0ef3bbc7fedaaad373"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"rand 0.9.2",
|
||||
@@ -4206,8 +4219,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42171f2af5d377e6bbcc8a8572144ee15b73a8f78ceb6160f1adeabf0d0f3e3c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4270,8 +4284,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25ef9499a1e581112f45fbf743fdc8e24830cda0bd13396b11c71aa6e6cba083"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4289,8 +4304,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-bitpacking"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1101fffd5b161bbdc6e932d6c0a7f94cb1752b0f8cd6d18ef9064052ab901a84"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"paste",
|
||||
@@ -4299,8 +4315,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "527ee5e6472d058d8c66c702fbe318a3f60f971e652e60dcfc6349bdbc9b0733"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4335,8 +4352,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65a80f7f15f2d941ec7b8253625cbb8e12081ea27584dd1fbc657fb9fb377f7a"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4365,8 +4383,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datagen"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0495c8afa18f246ac4b337c47d7827560283783963dd2177862d91161478fd79"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4383,8 +4402,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e80e9ae49d68b95d58e77d9177f68983dce4f0803ef42840e1631b38dd66adc"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4412,6 +4432,7 @@ dependencies = [
|
||||
"prost-types",
|
||||
"rand 0.9.2",
|
||||
"snafu",
|
||||
"strum",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"xxhash-rust",
|
||||
@@ -4420,8 +4441,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1707f9f5097b36c82d3a8524bb41c762c80d5dfa5e32aa7bfc6a1c0847a1cce"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4455,8 +4477,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28ab52586a5a7f5371a5abf4862968231f8c0232ce0780bc456f1ec16e9370f9"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4477,6 +4500,7 @@ dependencies = [
|
||||
"datafusion-sql",
|
||||
"deepsize",
|
||||
"dirs",
|
||||
"fastbloom",
|
||||
"fst",
|
||||
"futures",
|
||||
"half",
|
||||
@@ -4491,6 +4515,7 @@ dependencies = [
|
||||
"lance-io",
|
||||
"lance-linalg",
|
||||
"lance-table",
|
||||
"libm",
|
||||
"log",
|
||||
"num-traits",
|
||||
"object_store",
|
||||
@@ -4507,13 +4532,15 @@ dependencies = [
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d606f9f6a7f8ec2cacf28dfce7b2fc39e7db9f0ec77f907b8e47c756e3dd163b"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -4553,8 +4580,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9f1a94a5d966ff1eae817a835e3a57b34f73300f83a43bb28e7e2806695b8ba"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4577,8 +4605,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fac5c0ca6e5c285645465b95fb99fc464a1fd22a6d4b32ae0e0760f06b4b8a7f"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4616,8 +4645,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "0.35.0"
|
||||
source = "git+https://github.com/lancedb/lance.git?tag=v0.35.0-beta.4#e842a8f922b90c298c356dd1c6afdc83ca5253f2"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "384acc1dd13379a2ae24f3e3635d9c1f4fb4dc1534f7ffd2740c268f2eb73455"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
@@ -4628,7 +4658,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb"
|
||||
version = "0.22.1-beta.2"
|
||||
version = "0.22.1-beta.3"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4715,7 +4745,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-nodejs"
|
||||
version = "0.22.1-beta.2"
|
||||
version = "0.22.1-beta.3"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-ipc",
|
||||
@@ -4735,7 +4765,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.1-beta.2"
|
||||
version = "0.25.1-beta.3"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -7781,6 +7811,15 @@ version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
|
||||
dependencies = [
|
||||
"strum_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.25.3"
|
||||
@@ -8441,6 +8480,9 @@ name = "twox-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
|
||||
dependencies = [
|
||||
"rand 0.9.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
|
||||
16
Cargo.toml
16
Cargo.toml
@@ -15,14 +15,14 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.35.0", default-features = false, "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance = { "version" = "=0.37.0", default-features = false, "features" = ["dynamodb"] }
|
||||
lance-io = { "version" = "=0.37.0", default-features = false }
|
||||
lance-index = "=0.37.0"
|
||||
lance-linalg = "=0.37.0"
|
||||
lance-table = "=0.37.0"
|
||||
lance-testing = "=0.37.0"
|
||||
lance-datafusion = "=0.37.0"
|
||||
lance-encoding = "=0.37.0"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "55.1", optional = false }
|
||||
arrow-array = "55.1"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
|
||||
@@ -18,8 +19,12 @@ def run_command(command: str) -> str:
|
||||
|
||||
def get_latest_stable_version() -> str:
|
||||
version_line = run_command("cargo info lance | grep '^version:'")
|
||||
version = version_line.split(" ")[1].strip()
|
||||
return version
|
||||
# Example output: "version: 0.35.0 (latest 0.37.0)"
|
||||
match = re.search(r'\(latest ([0-9.]+)\)', version_line)
|
||||
if match:
|
||||
return match.group(1)
|
||||
# Fallback: use the first version after 'version:'
|
||||
return version_line.split("version:")[1].split()[0].strip()
|
||||
|
||||
|
||||
def get_latest_preview_version() -> str:
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.2</version>
|
||||
<version>0.22.1-beta.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.2</version>
|
||||
<version>0.22.1-beta.3</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.2</version>
|
||||
<version>0.22.1-beta.3</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.22.1-beta.2"
|
||||
version = "0.22.1-beta.3"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1008,5 +1008,64 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(result).toEqual(null);
|
||||
});
|
||||
});
|
||||
|
||||
describe("boolean null handling", function () {
|
||||
it("should handle null values in nullable boolean fields", () => {
|
||||
const { makeArrowTable } = require("../lancedb/arrow");
|
||||
const schema = new Schema([new Field("test", new arrow.Bool(), true)]);
|
||||
|
||||
// Test with all null values
|
||||
const data = [{ test: null }];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
expect(table.numRows).toBe(1);
|
||||
expect(table.schema.names).toEqual(["test"]);
|
||||
expect(table.getChild("test")!.get(0)).toBeNull();
|
||||
});
|
||||
|
||||
it("should handle mixed null and non-null boolean values", () => {
|
||||
const { makeArrowTable } = require("../lancedb/arrow");
|
||||
const schema = new Schema([new Field("test", new Bool(), true)]);
|
||||
|
||||
// Test with mixed values
|
||||
const data = [{ test: true }, { test: null }, { test: false }];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
expect(table.numRows).toBe(3);
|
||||
expect(table.getChild("test")!.get(0)).toBe(true);
|
||||
expect(table.getChild("test")!.get(1)).toBeNull();
|
||||
expect(table.getChild("test")!.get(2)).toBe(false);
|
||||
});
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Test for the undefined values bug fix
|
||||
describe("undefined values handling", () => {
|
||||
it("should handle mixed undefined and actual values", () => {
|
||||
const schema = new Schema([
|
||||
new Field("text", new Utf8(), true), // nullable
|
||||
new Field("number", new Int32(), true), // nullable
|
||||
new Field("bool", new Bool(), true), // nullable
|
||||
]);
|
||||
|
||||
const data = [
|
||||
{ text: undefined, number: 42, bool: true },
|
||||
{ text: "hello", number: undefined, bool: false },
|
||||
{ text: "world", number: 123, bool: undefined },
|
||||
];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
const result = table.toArray();
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe(null);
|
||||
expect(result[0].number).toBe(42);
|
||||
expect(result[0].bool).toBe(true);
|
||||
expect(result[1].text).toBe("hello");
|
||||
expect(result[1].number).toBe(null);
|
||||
expect(result[1].bool).toBe(false);
|
||||
expect(result[2].text).toBe("world");
|
||||
expect(result[2].number).toBe(123);
|
||||
expect(result[2].bool).toBe(null);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -705,7 +705,7 @@ function transposeData(
|
||||
}
|
||||
return current;
|
||||
});
|
||||
return makeVector(values, field.type);
|
||||
return makeVector(values, field.type, undefined, field.nullable);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -752,9 +752,30 @@ function makeVector(
|
||||
values: unknown[],
|
||||
type?: DataType,
|
||||
stringAsDictionary?: boolean,
|
||||
nullable?: boolean,
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
): Vector<any> {
|
||||
if (type !== undefined) {
|
||||
// Convert undefined values to null for nullable fields
|
||||
if (nullable) {
|
||||
values = values.map((v) => (v === undefined ? null : v));
|
||||
}
|
||||
|
||||
// workaround for: https://github.com/apache/arrow-js/issues/68
|
||||
if (DataType.isBool(type)) {
|
||||
const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
|
||||
if (!hasNonNullValue) {
|
||||
const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
|
||||
const data = makeData({
|
||||
type: type,
|
||||
length: values.length,
|
||||
nullCount: values.length,
|
||||
nullBitmap,
|
||||
});
|
||||
return arrowMakeVector(data);
|
||||
}
|
||||
}
|
||||
|
||||
// No need for inference, let Arrow create it
|
||||
if (type instanceof Int) {
|
||||
if (DataType.isInt(type) && type.bitWidth === 64) {
|
||||
@@ -879,7 +900,12 @@ async function applyEmbeddingsFromMetadata(
|
||||
for (const field of schema.fields) {
|
||||
if (!(field.name in columns)) {
|
||||
const nullValues = new Array(table.numRows).fill(null);
|
||||
columns[field.name] = makeVector(nullValues, field.type);
|
||||
columns[field.name] = makeVector(
|
||||
nullValues,
|
||||
field.type,
|
||||
undefined,
|
||||
field.nullable,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -943,7 +969,12 @@ async function applyEmbeddings<T>(
|
||||
} else if (schema != null) {
|
||||
const destField = schema.fields.find((f) => f.name === destColumn);
|
||||
if (destField != null) {
|
||||
newColumns[destColumn] = makeVector([], destField.type);
|
||||
newColumns[destColumn] = makeVector(
|
||||
[],
|
||||
destField.type,
|
||||
undefined,
|
||||
destField.nullable,
|
||||
);
|
||||
} else {
|
||||
throw new Error(
|
||||
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.22.1-beta.2",
|
||||
"version": "0.22.1-beta.3",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.25.1-beta.3"
|
||||
current_version = "0.25.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.1-beta.3"
|
||||
version = "0.25.1"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -9,6 +9,7 @@ from .linear_combination import LinearCombinationReranker
|
||||
from .openai import OpenaiReranker
|
||||
from .jinaai import JinaReranker
|
||||
from .rrf import RRFReranker
|
||||
from .mrr import MRRReranker
|
||||
from .answerdotai import AnswerdotaiRerankers
|
||||
from .voyageai import VoyageAIReranker
|
||||
|
||||
@@ -23,4 +24,5 @@ __all__ = [
|
||||
"RRFReranker",
|
||||
"AnswerdotaiRerankers",
|
||||
"VoyageAIReranker",
|
||||
"MRRReranker",
|
||||
]
|
||||
|
||||
169
python/python/lancedb/rerankers/mrr.py
Normal file
169
python/python/lancedb/rerankers/mrr.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
|
||||
from typing import Union, List, TYPE_CHECKING
|
||||
import pyarrow as pa
|
||||
import numpy as np
|
||||
|
||||
from collections import defaultdict
|
||||
from .base import Reranker
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..table import LanceVectorQueryBuilder
|
||||
|
||||
|
||||
class MRRReranker(Reranker):
|
||||
"""
|
||||
Reranks the results using Mean Reciprocal Rank (MRR) algorithm based
|
||||
on the scores of vector and FTS search.
|
||||
Algorithm reference - https://en.wikipedia.org/wiki/Mean_reciprocal_rank
|
||||
|
||||
MRR calculates the average of reciprocal ranks across different search results.
|
||||
For each document, it computes the reciprocal of its rank in each system,
|
||||
then takes the mean of these reciprocal ranks as the final score.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
weight_vector : float, default 0.5
|
||||
Weight for vector search results (0.0 to 1.0)
|
||||
weight_fts : float, default 0.5
|
||||
Weight for FTS search results (0.0 to 1.0)
|
||||
Note: weight_vector + weight_fts should equal 1.0
|
||||
return_score : str, default "relevance"
|
||||
Options are "relevance" or "all"
|
||||
The type of score to return. If "relevance", will return only the relevance
|
||||
score. If "all", will return all scores from the vector and FTS search along
|
||||
with the relevance score.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
weight_vector: float = 0.5,
|
||||
weight_fts: float = 0.5,
|
||||
return_score="relevance",
|
||||
):
|
||||
if not (0.0 <= weight_vector <= 1.0):
|
||||
raise ValueError("weight_vector must be between 0.0 and 1.0")
|
||||
if not (0.0 <= weight_fts <= 1.0):
|
||||
raise ValueError("weight_fts must be between 0.0 and 1.0")
|
||||
if abs(weight_vector + weight_fts - 1.0) > 1e-6:
|
||||
raise ValueError("weight_vector + weight_fts must equal 1.0")
|
||||
|
||||
super().__init__(return_score)
|
||||
self.weight_vector = weight_vector
|
||||
self.weight_fts = weight_fts
|
||||
|
||||
def rerank_hybrid(
|
||||
self,
|
||||
query: str, # noqa: F821
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
|
||||
fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []
|
||||
|
||||
# Maps result_id to list of (type, reciprocal_rank)
|
||||
mrr_score_map = defaultdict(list)
|
||||
|
||||
if vector_ids:
|
||||
for rank, result_id in enumerate(vector_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(("vector", reciprocal_rank))
|
||||
|
||||
if fts_ids:
|
||||
for rank, result_id in enumerate(fts_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(("fts", reciprocal_rank))
|
||||
|
||||
final_mrr_scores = {}
|
||||
for result_id, scores in mrr_score_map.items():
|
||||
vector_rr = 0.0
|
||||
fts_rr = 0.0
|
||||
|
||||
for score_type, reciprocal_rank in scores:
|
||||
if score_type == "vector":
|
||||
vector_rr = reciprocal_rank
|
||||
elif score_type == "fts":
|
||||
fts_rr = reciprocal_rank
|
||||
|
||||
# If a document doesn't appear, its reciprocal rank is 0
|
||||
weighted_mrr = self.weight_vector * vector_rr + self.weight_fts * fts_rr
|
||||
final_mrr_scores[result_id] = weighted_mrr
|
||||
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_row_ids = combined_results["_rowid"].to_pylist()
|
||||
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||
combined_results = combined_results.append_column(
|
||||
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||
)
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_multivector(
|
||||
self,
|
||||
vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
|
||||
query: str = None,
|
||||
deduplicate: bool = True, # noqa: F821
|
||||
):
|
||||
"""
|
||||
Reranks the results from multiple vector searches using MRR algorithm.
|
||||
Each vector search result is treated as a separate ranking system,
|
||||
and MRR calculates the mean of reciprocal ranks across all systems.
|
||||
This cannot reuse rerank_hybrid because MRR semantics require treating
|
||||
each vector result as a separate ranking system.
|
||||
"""
|
||||
if not all(isinstance(v, type(vector_results[0])) for v in vector_results):
|
||||
raise ValueError(
|
||||
"All elements in vector_results should be of the same type"
|
||||
)
|
||||
|
||||
# avoid circular import
|
||||
if type(vector_results[0]).__name__ == "LanceVectorQueryBuilder":
|
||||
vector_results = [result.to_arrow() for result in vector_results]
|
||||
elif not isinstance(vector_results[0], pa.Table):
|
||||
raise ValueError(
|
||||
"vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
|
||||
)
|
||||
|
||||
if not all("_rowid" in result.column_names for result in vector_results):
|
||||
raise ValueError(
|
||||
"'_rowid' is required for deduplication. \
|
||||
add _rowid to search results like this: \
|
||||
`search().with_row_id(True)`"
|
||||
)
|
||||
|
||||
mrr_score_map = defaultdict(list)
|
||||
|
||||
for result_table in vector_results:
|
||||
result_ids = result_table["_rowid"].to_pylist()
|
||||
for rank, result_id in enumerate(result_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(reciprocal_rank)
|
||||
|
||||
final_mrr_scores = {}
|
||||
for result_id, reciprocal_ranks in mrr_score_map.items():
|
||||
mean_rr = np.mean(reciprocal_ranks)
|
||||
final_mrr_scores[result_id] = mean_rr
|
||||
|
||||
combined = pa.concat_tables(vector_results, **self._concat_tables_args)
|
||||
combined = self._deduplicate(combined)
|
||||
|
||||
combined_row_ids = combined["_rowid"].to_pylist()
|
||||
|
||||
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||
combined = combined.append_column(
|
||||
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||
)
|
||||
combined = combined.sort_by([("_relevance_score", "descending")])
|
||||
|
||||
if self.score == "relevance":
|
||||
combined = self._keep_relevance_score(combined)
|
||||
|
||||
return combined
|
||||
@@ -1470,10 +1470,7 @@ class Table(ABC):
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -2835,10 +2832,7 @@ class LanceTable(Table):
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -4298,10 +4292,7 @@ class AsyncTable:
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -4324,10 +4315,19 @@ class AsyncTable:
|
||||
cleanup_since_ms: Optional[int] = None
|
||||
if cleanup_older_than is not None:
|
||||
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
||||
|
||||
if retrain:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"The 'retrain' parameter is deprecated and will be removed in a "
|
||||
"future version.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
return await self._inner.optimize(
|
||||
cleanup_since_ms=cleanup_since_ms,
|
||||
delete_unverified=delete_unverified,
|
||||
retrain=retrain,
|
||||
)
|
||||
|
||||
async def list_indices(self) -> Iterable[IndexConfig]:
|
||||
|
||||
@@ -22,6 +22,7 @@ from lancedb.rerankers import (
|
||||
JinaReranker,
|
||||
AnswerdotaiRerankers,
|
||||
VoyageAIReranker,
|
||||
MRRReranker,
|
||||
)
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
@@ -46,6 +47,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
db,
|
||||
"my_table",
|
||||
schema=MyTable,
|
||||
mode="overwrite",
|
||||
)
|
||||
|
||||
# Need to test with a bunch of phrases to make sure sorting is consistent
|
||||
@@ -96,7 +98,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
# Create a fts index
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
||||
|
||||
return table, MyTable
|
||||
|
||||
@@ -320,6 +322,34 @@ def test_rrf_reranker(tmp_path, use_tantivy):
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_mrr_reranker(tmp_path, use_tantivy):
|
||||
reranker = MRRReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
|
||||
# Test multi-vector part
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
query = "single player experience"
|
||||
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||
rs2 = (
|
||||
table.search(query, vector_column_name="meta_vector")
|
||||
.limit(10)
|
||||
.with_row_id(True)
|
||||
)
|
||||
result = reranker.rerank_multivector([rs1, rs2])
|
||||
assert "_relevance_score" in result.column_names
|
||||
assert len(result) <= 20
|
||||
|
||||
if len(result) > 1:
|
||||
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
|
||||
"The _relevance_score should be descending."
|
||||
)
|
||||
|
||||
# Test with duplicate results
|
||||
result_deduped = reranker.rerank_multivector([rs1, rs2, rs1])
|
||||
assert len(result_deduped) == len(result)
|
||||
|
||||
|
||||
def test_rrf_reranker_distance():
|
||||
data = pa.table(
|
||||
{
|
||||
|
||||
@@ -591,12 +591,11 @@ impl Table {
|
||||
}
|
||||
|
||||
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
|
||||
pub fn optimize(
|
||||
self_: PyRef<'_, Self>,
|
||||
cleanup_since_ms: Option<u64>,
|
||||
delete_unverified: Option<bool>,
|
||||
retrain: Option<bool>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
let older_than = if let Some(ms) = cleanup_since_ms {
|
||||
@@ -632,10 +631,9 @@ impl Table {
|
||||
.prune
|
||||
.unwrap();
|
||||
inner
|
||||
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
|
||||
Some(true) => OptimizeOptions::retrain(),
|
||||
_ => OptimizeOptions::default(),
|
||||
}))
|
||||
.optimize(lancedb::table::OptimizeAction::Index(
|
||||
OptimizeOptions::default(),
|
||||
))
|
||||
.await
|
||||
.infer_error()?;
|
||||
Ok(OptimizeStats {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.22.1-beta.2"
|
||||
version = "0.22.1-beta.3"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
//! values
|
||||
use std::cmp::max;
|
||||
|
||||
use lance::table::format::{Index, Manifest};
|
||||
use lance::table::format::{IndexMetadata, Manifest};
|
||||
|
||||
use crate::DistanceType;
|
||||
|
||||
@@ -19,7 +19,7 @@ pub struct VectorIndex {
|
||||
}
|
||||
|
||||
impl VectorIndex {
|
||||
pub fn new_from_format(manifest: &Manifest, index: &Index) -> Self {
|
||||
pub fn new_from_format(manifest: &Manifest, index: &IndexMetadata) -> Self {
|
||||
let fields = index
|
||||
.fields
|
||||
.iter()
|
||||
|
||||
@@ -1976,6 +1976,8 @@ impl NativeTable {
|
||||
/// Delete keys from the config
|
||||
pub async fn delete_config_keys(&self, delete_keys: &[&str]) -> Result<()> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
// TODO: update this when we implement metadata APIs
|
||||
#[allow(deprecated)]
|
||||
dataset.delete_config_keys(delete_keys).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1986,6 +1988,8 @@ impl NativeTable {
|
||||
upsert_values: impl IntoIterator<Item = (String, String)>,
|
||||
) -> Result<()> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
// TODO: update this when we implement metadata APIs
|
||||
#[allow(deprecated)]
|
||||
dataset.replace_schema_metadata(upsert_values).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user