Compare commits

...

10 Commits

Author SHA1 Message Date
Lance Release
ce24457531 Bump version: 0.24.1 → 0.24.2-beta.0 2025-07-18 16:02:37 +00:00
BubbleCal
087fe6343d test: fix random data may break test case (#2514)
This test adds a new vector and then performs a vector search with a
distance range.
It may fail if the new vector becomes the closest one to the query
vector.

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-07-18 16:15:06 +08:00
Wyatt Alt
ab8cbe62dd fix: excessive object storage handle creation in create_table (#2505)
This fixes two bugs with create_table storage handle reuse. The first
issue is that the database object did not previously carry a session
that create_table operations could reuse.

The second issue is that the inheritance logic for create_table and
open_table was causing empty storage options (i.e. Some({})) to be sent
instead of None. Lance handles these differently:

* When None is set, the object store held in the session's storage
registry that was created at "connect" is used. This value stays in the
cache long-term (probably as long as the db reference is held).
* When Some({}) is sent, LanceDB will create a new connection and cache
it for an empty key. However, that cached value will remain valid only
as long as the client holds a reference to the table. After that, the
cache is poisoned and the next create_table with the same key will
create a new connection. This confounds reuse if, e.g., Python
garbage-collects the table object before another table is created.

My feeling is that the second path, if intentional, is probably meant to
serve cases where tables are overriding settings and the cached
connection is assumed not to be generally applicable. The bug is we were
engaging that mechanism for all tables.
2025-07-17 16:27:23 -07:00
Ayush Chaurasia
f076bb41f4 feat: add support for returning all scores with rerankers (#2509)
Previously `return_score="all"` was supported only for the default
reranker (RRF) and not the model-based rerankers.
This adds support for keeping all scores in the base reranker so that
all model-based rerankers can use it. It's a slower path than keeping
just the relevance score, but it can be useful for debugging.
2025-07-15 21:03:03 +05:30
BubbleCal
902fb83d54 fix: set_lance_version may miss features when upgrading lance (#2510)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-07-15 20:11:10 +08:00
BubbleCal
779118339f chore: upgrade lance to 0.31.2-beta.3 (#2508)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-07-15 17:08:11 +08:00
BubbleCal
03b62599d7 feat: support ngram tokenizer (#2507)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
2025-07-15 16:36:08 +08:00
Benjamin Schmidt
4c999fb651 chore: fix cleanupOlderThan docs (#2504)
Thanks for all your work.

The docstring for `OptimizeOptions` seems to reference a non-existent
method on `Table`. I believe this is the correct example for
`cleanupOlderThan`.

This also appears in the generated docs, but I assume they live
downstream from this code?
2025-07-15 16:23:10 +08:00
Lance Release
6d23d32ab5 Bump version: 0.21.1-beta.2 → 0.21.1 2025-07-10 21:36:59 +00:00
Lance Release
704cec34e1 Bump version: 0.21.1-beta.1 → 0.21.1-beta.2 2025-07-10 21:36:26 +00:00
45 changed files with 577 additions and 218 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.21.1-beta.1"
current_version = "0.21.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

74
Cargo.lock generated
View File

@@ -2840,9 +2840,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2afa8e97cfde0f44698a13dc5afc0a29840eb0d252a918986b34dc5baa166d9a"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"rand 0.8.5",
]
@@ -3931,9 +3930,8 @@ dependencies = [
[[package]]
name = "lance"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8258e6fe0283f6a9764ffe520e3e2e7727b00a7f14f03852b167692608242f0e"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-arith",
@@ -3995,9 +3993,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135b0fdadc4ada6f8c382379a97f09f6a6186f05b33bd23743a959151a5cf233"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4014,9 +4011,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83eb8468fcdfe4b4eacbcb1c151a72ac032d9143f4203a7f3bfbde64214a40b3"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4051,9 +4047,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95a266374d53d37ee4378fbc3e4827931e1119e609bc41b64eb5a5ca93c8e8eb"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-array",
@@ -4081,9 +4076,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3890c9f35c318969735387f154c7633ccd5039e6a9879b9372a4005e43182073"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-array",
@@ -4099,9 +4093,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b68ab7867165826bf893dcff9e2ce565e6a0299862b7315a1c0ef1470a6c9144"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrayref",
"arrow",
@@ -4140,9 +4133,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc5c8e5b19054c74e0bf7fcfa6038bfec0f881209d66e04e41f4a4f2e0272317"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4176,9 +4168,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa06deb03ad01fb42790f525e60b297ff9011a6590f769f698626a8e5ea53350"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-array",
@@ -4232,9 +4223,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac1794d9fe428e5a75fdef80e259b5d16af7ee5c95b866f1270c78368f206ed"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-arith",
@@ -4272,11 +4262,11 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42bfda62395e6123de712a7fbd1a5db0c678e53a6f46f1c33d9729814106e7f1"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-ord",
"arrow-schema",
"bitvec",
@@ -4296,9 +4286,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6919d1e0f24741a01b3ee65ee57f05e89472b75b6086874936e1f01e456f6f6f"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow",
"arrow-array",
@@ -4336,9 +4325,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0ed2c0ac602bdcc17836b8c2c74cbb8fff962570f2df40d6c1d4a540053de44"
version = "0.31.2"
source = "git+https://github.com/lancedb/lance.git?tag=v0.31.2-beta.3#6e987921d0efbe42c018047ea45d1b6f624d8280"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4349,7 +4337,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.21.1-beta.1"
version = "0.21.1"
dependencies = [
"arrow",
"arrow-array",
@@ -4436,7 +4424,7 @@ dependencies = [
[[package]]
name = "lancedb-node"
version = "0.21.1-beta.1"
version = "0.21.1"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4461,7 +4449,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.21.1-beta.1"
version = "0.21.1"
dependencies = [
"arrow-array",
"arrow-ipc",
@@ -4481,7 +4469,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.24.1-beta.1"
version = "0.24.1"
dependencies = [
"arrow",
"env_logger",

View File

@@ -21,14 +21,16 @@ categories = ["database-implementations"]
rust-version = "1.78.0"
[workspace.dependencies]
lance = { "version" = "=0.31.1", features = ["dynamodb"] }
lance-io = { "version" = "=0.31.1" }
lance-index = { "version" = "=0.31.1" }
lance-linalg = { "version" = "=0.31.1" }
lance-table = { "version" = "=0.31.1" }
lance-testing = { "version" = "=0.31.1" }
lance-datafusion = { "version" = "=0.31.1" }
lance-encoding = { "version" = "=0.31.1" }
lance = { "version" = "=0.31.2", "features" = [
"dynamodb",
], "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-io = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-index = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-linalg = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-table = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-testing = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-datafusion = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
lance-encoding = { "version" = "=0.31.2", "tag" = "v0.31.2-beta.3", "git" = "https://github.com/lancedb/lance.git" }
# Note that this one does not include pyarrow
arrow = { version = "55.1", optional = false }
arrow-array = "55.1"

View File

@@ -47,10 +47,10 @@ def extract_features(line: str) -> list:
"""
import re
match = re.search(r'"features"\s*=\s*\[(.*?)\]', line)
match = re.search(r'"features"\s*=\s*\[\s*(.*?)\s*\]', line, re.DOTALL)
if match:
features_str = match.group(1)
return [f.strip('"') for f in features_str.split(",")]
return [f.strip('"') for f in features_str.split(",") if len(f) > 0]
return []
@@ -63,10 +63,24 @@ def update_cargo_toml(line_updater):
lines = f.readlines()
new_lines = []
lance_line = ""
is_parsing_lance_line = False
for line in lines:
if line.startswith("lance"):
# Update the line using the provided function
new_lines.append(line_updater(line))
if line.strip().endswith("}"):
new_lines.append(line_updater(line))
else:
lance_line = line
is_parsing_lance_line = True
elif is_parsing_lance_line:
lance_line += line
if line.strip().endswith("}"):
new_lines.append(line_updater(lance_line))
lance_line = ""
is_parsing_lance_line = False
else:
print("doesn't end with }:", line)
else:
# Keep the line unchanged
new_lines.append(line)

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.1-beta.1</version>
<version>0.21.1-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.21.1-beta.1</version>
<version>0.21.1-final.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

49
node/package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"cpu": [
"x64",
"arm64"
@@ -52,11 +52,11 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.21.1-beta.1",
"@lancedb/vectordb-darwin-x64": "0.21.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.21.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.21.1-beta.1"
"@lancedb/vectordb-darwin-arm64": "0.21.1",
"@lancedb/vectordb-darwin-x64": "0.21.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.1",
"@lancedb/vectordb-linux-x64-gnu": "0.21.1",
"@lancedb/vectordb-win32-x64-msvc": "0.21.1"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -327,60 +327,65 @@
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.21.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.1-beta.1.tgz",
"integrity": "sha512-D9SOLFb/40E2/9bt82xOti3jogRAaR1UkT2LfGZJw/0wBu8d8/xKjWgfm3d26S5K6in6DWsX1njLxevrFqD5HA==",
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.21.1.tgz",
"integrity": "sha512-eXeOKgK5s7MSKDzA7Hl4/9E2X8tWWMNV7UJiFdwxrUcop86tM5ePBi8tApRnaQ3wBXrs99XTVBJ7+j+2gzilVA==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.21.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.1-beta.1.tgz",
"integrity": "sha512-JnZ41aDOJs6LWfI9t/+MnpqsK/Fj9r/hDdZSOjcQquLOcm2eP3NnvEnDvn+1pqWBN6ceqf1avTatPBGnD/yhNA==",
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.21.1.tgz",
"integrity": "sha512-vLoPWfg7OPw5vazLH5/YD/yQkZiTiPniuQgsH+xTodRfLf926lny53G7LQ6nFXNKIzX/jYKtg7AfMU8IcDLSEQ==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.21.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.1-beta.1.tgz",
"integrity": "sha512-Xnw0wYtnfzVUr4DzppJCSx+HZdAHr6sqMC8SdaYNQ9XEjBZE20n5SO2AdBYjejbmONJ7lpGs3ydnLIZ6N40dAQ==",
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.21.1.tgz",
"integrity": "sha512-IMAxtXj5aHCv9peziN77IxQpkYFj83KvI8zQCHzbMMXv7BspkhAd0PaUViqHqtTf2TUHjYQ66a7clZrEn+xQuQ==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.21.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.1-beta.1.tgz",
"integrity": "sha512-7S7gV13hv9Ho5W1Jat3FYiaMJOjRAwZOol7lKvOhU+sR/tJMEfZIOWAgymoqhAowbMtf+wwLoeKacfybXGET/w==",
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.21.1.tgz",
"integrity": "sha512-9oPOxBsYGngIhtC/oC+fQ9V0w9mgFuj2Wyler8f5UYQdiAutsTNyOUA+XjtcROjVZrZ5oUeIrvOQSte9BbpRTg==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.21.1-beta.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.1-beta.1.tgz",
"integrity": "sha512-w6fEQA9IquvJ/GUYfiawRQvvdFD6OU44UW9JWm+FoscUFzdLiV7qmH4QjYEeEXQD7ob83ikFaxXGPTksYXpNOA==",
"version": "0.21.1",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.21.1.tgz",
"integrity": "sha512-XqDXFLfdjNpDZ5jaqLerdx+sDU4YLuPK3VF4TowwcOlWDrUtI/L1lAyCaKxcyz1qE3VGuZvhNU89N5ioEICb4Q==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"win32"

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"description": " Serverless, low-latency vector database for AI applications",
"private": false,
"main": "dist/index.js",
@@ -89,10 +89,10 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.21.1-beta.1",
"@lancedb/vectordb-darwin-arm64": "0.21.1-beta.1",
"@lancedb/vectordb-linux-x64-gnu": "0.21.1-beta.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.1-beta.1",
"@lancedb/vectordb-win32-x64-msvc": "0.21.1-beta.1"
"@lancedb/vectordb-darwin-x64": "0.21.1",
"@lancedb/vectordb-darwin-arm64": "0.21.1",
"@lancedb/vectordb-linux-x64-gnu": "0.21.1",
"@lancedb/vectordb-linux-arm64-gnu": "0.21.1",
"@lancedb/vectordb-win32-x64-msvc": "0.21.1"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.21.1-beta.1"
version = "0.21.1"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -1706,6 +1706,60 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
expect(mustNotResults.length).toBe(1);
});
test("full text search ngram", async () => {
const db = await connect(tmpDir.name);
const data = [
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
{ text: "lance database", vector: [0.4, 0.5, 0.6] },
{ text: "lance is cool", vector: [0.7, 0.8, 0.9] },
];
const table = await db.createTable("test", data);
await table.createIndex("text", {
config: Index.fts({ baseTokenizer: "ngram" }),
});
const results = await table.search("lan").toArray();
expect(results.length).toBe(2);
const resultSet = new Set(results.map((r) => r.text));
expect(resultSet.has("lance database")).toBe(true);
expect(resultSet.has("lance is cool")).toBe(true);
const results2 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results2.length).toBe(2);
const resultSet2 = new Set(results2.map((r) => r.text));
expect(resultSet2.has("lance database")).toBe(true);
expect(resultSet2.has("lance is cool")).toBe(true);
// the default min_ngram_length is 3, so "la" should not match
const results3 = await table.search("la").toArray();
expect(results3.length).toBe(0);
// test setting min_ngram_length and prefix_only
await table.createIndex("text", {
config: Index.fts({
baseTokenizer: "ngram",
ngramMinLength: 2,
prefixOnly: true,
}),
replace: true,
});
const results4 = await table.search("lan").toArray();
expect(results4.length).toBe(2);
const resultSet4 = new Set(results4.map((r) => r.text));
expect(resultSet4.has("lance database")).toBe(true);
expect(resultSet4.has("lance is cool")).toBe(true);
const results5 = await table.search("nce").toArray(); // spellchecker:disable-line
expect(results5.length).toBe(0);
const results6 = await table.search("la").toArray();
expect(results6.length).toBe(2);
const resultSet6 = new Set(results6.map((r) => r.text));
expect(resultSet6.has("lance database")).toBe(true);
expect(resultSet6.has("lance is cool")).toBe(true);
});
test.each([
[0.4, 0.5, 0.599], // number[]
Float32Array.of(0.4, 0.5, 0.599), // Float32Array

View File

@@ -439,7 +439,7 @@ export interface FtsOptions {
*
* "raw" - Raw tokenizer. This tokenizer does not split the text into tokens and indexes the entire text as a single token.
*/
baseTokenizer?: "simple" | "whitespace" | "raw";
baseTokenizer?: "simple" | "whitespace" | "raw" | "ngram";
/**
* language for stemming and stop words
@@ -472,6 +472,21 @@ export interface FtsOptions {
* whether to fold accented characters to their ASCII equivalents (e.g. "café" becomes "cafe")
*/
asciiFolding?: boolean;
/**
* ngram min length
*/
ngramMinLength?: number;
/**
* ngram max length
*/
ngramMaxLength?: number;
/**
* whether to only index the prefix of the token for ngram tokenizer
*/
prefixOnly?: boolean;
}
export class Index {
@@ -608,6 +623,9 @@ export class Index {
options?.stem,
options?.removeStopWords,
options?.asciiFolding,
options?.ngramMinLength,
options?.ngramMaxLength,
options?.prefixOnly,
),
);
}

View File

@@ -75,10 +75,10 @@ export interface OptimizeOptions {
* // Delete all versions older than 1 day
* const olderThan = new Date();
* olderThan.setDate(olderThan.getDate() - 1);
* tbl.cleanupOlderVersions(olderThan);
* tbl.optimize({cleanupOlderThan: olderThan});
*
* // Delete all versions except the current version
* tbl.cleanupOlderVersions(new Date());
* tbl.optimize({cleanupOlderThan: new Date()});
*/
cleanupOlderThan: Date;
deleteUnverified: boolean;

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.21.1-beta.1",
"version": "0.21.1",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.21.1-beta.1",
"version": "0.21.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -123,6 +123,9 @@ impl Index {
stem: Option<bool>,
remove_stop_words: Option<bool>,
ascii_folding: Option<bool>,
ngram_min_length: Option<u32>,
ngram_max_length: Option<u32>,
prefix_only: Option<bool>,
) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
@@ -149,6 +152,15 @@ impl Index {
if let Some(ascii_folding) = ascii_folding {
opts = opts.ascii_folding(ascii_folding);
}
if let Some(ngram_min_length) = ngram_min_length {
opts = opts.ngram_min_length(ngram_min_length);
}
if let Some(ngram_max_length) = ngram_max_length {
opts = opts.ngram_max_length(ngram_max_length);
}
if let Some(prefix_only) = prefix_only {
opts = opts.ngram_prefix_only(prefix_only);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.24.1"
current_version = "0.24.2-beta.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.24.1"
version = "0.24.2-beta.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -137,6 +137,9 @@ class FTS:
stem: bool = True
remove_stop_words: bool = True
ascii_folding: bool = True
ngram_min_length: int = 3
ngram_max_length: int = 3
prefix_only: bool = False
@dataclass

View File

@@ -1374,6 +1374,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
if query_string is not None and not isinstance(query_string, str):
raise ValueError("Reranking currently only supports string queries")
self._str_query = query_string if query_string is not None else self._str_query
if reranker.score == "all":
self.with_row_id(True)
return self
def bypass_vector_index(self) -> LanceVectorQueryBuilder:
@@ -1569,6 +1571,8 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
The LanceQueryBuilder object.
"""
self._reranker = reranker
if reranker.score == "all":
self.with_row_id(True)
return self
@@ -1845,6 +1849,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._norm = normalize
self._reranker = reranker
if reranker.score == "all":
self.with_row_id(True)
return self

View File

@@ -158,6 +158,9 @@ class RemoteTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
config = FTS(
with_position=with_position,
@@ -168,6 +171,9 @@ class RemoteTable(Table):
stem=stem,
remove_stop_words=remove_stop_words,
ascii_folding=ascii_folding,
ngram_min_length=ngram_min_length,
ngram_max_length=ngram_max_length,
prefix_only=prefix_only,
)
LOOP.run(
self._table.create_index(

View File

@@ -74,9 +74,7 @@ class AnswerdotaiRerankers(Reranker):
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"Answerdotai Reranker does not support score='all' yet"
)
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
combined_results = combined_results.sort_by(
[("_relevance_score", "descending")]
)

View File

@@ -232,6 +232,39 @@ class Reranker(ABC):
return deduped_table
def _merge_and_keep_scores(self, vector_results: pa.Table, fts_results: pa.Table):
"""
Merge the results from the vector and FTS search and keep the scores.
This op is slower than just keeping relevance score but can be useful
for debugging.

Rows are matched on their ``_rowid`` value. A row returned by both
searches keeps the vector-search row and takes its ``_score`` from the
FTS row; a row returned only by the FTS search is added unchanged.

Parameters
----------
vector_results : pa.Table
    Vector search results; a null ``_score`` column is appended when the
    table has none.
fts_results : pa.Table
    FTS results; a null ``_distance`` column is appended when the table
    has none.

Returns
-------
pa.Table
    A table rebuilt from the merged rows.
"""
# add nulls to fts results for _distance
if "_distance" not in fts_results.column_names:
fts_results = fts_results.append_column(
"_distance",
pa.array([None] * len(fts_results), type=pa.float32()),
)
# add nulls to vector results for _score
if "_score" not in vector_results.column_names:
vector_results = vector_results.append_column(
"_score",
pa.array([None] * len(vector_results), type=pa.float32()),
)
# combine them and fill the scores
# Index each result set by _rowid; both tables must expose a _rowid column,
# otherwise the lookup below raises KeyError. If a _rowid repeats within one
# table, the later row replaces the earlier one in the dict.
vector_results_dict = {row["_rowid"]: row for row in vector_results.to_pylist()}
fts_results_dict = {row["_rowid"]: row for row in fts_results.to_pylist()}
# merge them into vector_results
for key, value in fts_results_dict.items():
if key in vector_results_dict:
# found by both searches: keep the vector row, copy over only the FTS score
vector_results_dict[key]["_score"] = value["_score"]
else:
# found only by FTS: take the FTS row as-is
vector_results_dict[key] = value
# NOTE(review): the table is rebuilt from Python dicts with no explicit schema,
# so column types are inferred by pyarrow — confirm this is acceptable when the
# merged result is empty.
combined = pa.Table.from_pylist(list(vector_results_dict.values()))
return combined
def _keep_relevance_score(self, combined_results: pa.Table):
if self.score == "relevance":
if "_score" in combined_results.column_names:

View File

@@ -92,14 +92,14 @@ class CohereReranker(Reranker):
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
if self.score == "all":
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
else:
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"return_score='all' not implemented for cohere reranker"
)
return combined_results
def rerank_vector(self, query: str, vector_results: pa.Table):

View File

@@ -81,15 +81,15 @@ class CrossEncoderReranker(Reranker):
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
if self.score == "all":
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
else:
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
# sort the results by _score
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"return_score='all' not implemented for CrossEncoderReranker"
)
combined_results = combined_results.sort_by(
[("_relevance_score", "descending")]
)

View File

@@ -97,14 +97,14 @@ class JinaReranker(Reranker):
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
if self.score == "all":
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
else:
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"return_score='all' not implemented for JinaReranker"
)
return combined_results
def rerank_vector(self, query: str, vector_results: pa.Table):

View File

@@ -88,14 +88,13 @@ class OpenaiReranker(Reranker):
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
if self.score == "all":
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
else:
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"OpenAI Reranker does not support score='all' yet"
)
combined_results = combined_results.sort_by(
[("_relevance_score", "descending")]

View File

@@ -94,14 +94,14 @@ class VoyageAIReranker(Reranker):
vector_results: pa.Table,
fts_results: pa.Table,
):
combined_results = self.merge_results(vector_results, fts_results)
if self.score == "all":
combined_results = self._merge_and_keep_scores(vector_results, fts_results)
else:
combined_results = self.merge_results(vector_results, fts_results)
combined_results = self._rerank(combined_results, query)
if self.score == "relevance":
combined_results = self._keep_relevance_score(combined_results)
elif self.score == "all":
raise NotImplementedError(
"return_score='all' not implemented for voyageai reranker"
)
return combined_results
def rerank_vector(self, query: str, vector_results: pa.Table):

View File

@@ -838,6 +838,9 @@ class Table(ABC):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
wait_timeout: Optional[timedelta] = None,
):
"""Create a full-text search index on the table.
@@ -877,6 +880,7 @@ class Table(ABC):
- "simple": Splits text by whitespace and punctuation.
- "whitespace": Split text by whitespace, but not punctuation.
- "raw": No tokenization. The entire text is treated as a single token.
- "ngram": N-Gram tokenizer.
language : str, default "English"
The language to use for tokenization.
max_token_length : int, default 40
@@ -894,6 +898,12 @@ class Table(ABC):
ascii_folding : bool, default True
Whether to fold ASCII characters. This converts accented characters to
their ASCII equivalent. For example, "café" would be converted to "cafe".
ngram_min_length: int, default 3
The minimum length of an n-gram.
ngram_max_length: int, default 3
The maximum length of an n-gram.
prefix_only: bool, default False
Whether to only index the prefix of the token for ngram tokenizer.
wait_timeout: timedelta, optional
The timeout to wait if indexing is asynchronous.
"""
@@ -1981,6 +1991,9 @@ class LanceTable(Table):
stem: bool = True,
remove_stop_words: bool = True,
ascii_folding: bool = True,
ngram_min_length: int = 3,
ngram_max_length: int = 3,
prefix_only: bool = False,
):
if not use_tantivy:
if not isinstance(field_names, str):
@@ -1996,6 +2009,9 @@ class LanceTable(Table):
"stem": stem,
"remove_stop_words": remove_stop_words,
"ascii_folding": ascii_folding,
"ngram_min_length": ngram_min_length,
"ngram_max_length": ngram_max_length,
"prefix_only": prefix_only,
}
else:
tokenizer_configs = self.infer_tokenizer_configs(tokenizer_name)
@@ -2065,6 +2081,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "raw":
return {
@@ -2075,6 +2094,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
elif tokenizer_name == "whitespace":
return {
@@ -2085,6 +2107,9 @@ class LanceTable(Table):
"stem": False,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
# or it's with language stemming with pattern like "en_stem"
@@ -2103,6 +2128,9 @@ class LanceTable(Table):
"stem": True,
"remove_stop_words": False,
"ascii_folding": False,
"ngram_min_length": 3,
"ngram_max_length": 3,
"prefix_only": False,
}
def add(

View File

@@ -25,4 +25,4 @@ IndexType = Literal[
]
# Tokenizer literals
BaseTokenizerType = Literal["simple", "raw", "whitespace"]
BaseTokenizerType = Literal["simple", "raw", "whitespace", "ngram"]

View File

@@ -669,3 +669,46 @@ def test_fts_on_list(mem_db: DBConnection):
res = table.search(PhraseQuery("lance database", "text")).limit(5).to_list()
assert len(res) == 2
def test_fts_ngram(mem_db: DBConnection):
    """Exercise the "ngram" base tokenizer of the native (non-tantivy) FTS index.

    The index is first built with the default n-gram settings and then
    rebuilt with ``ngram_min_length=2`` and ``prefix_only=True``; after each
    build, the rows returned for a few short query terms are checked.
    """
    lance_rows = {"lance database", "lance is cool"}
    corpus = pa.table({"text": ["hello world", "lance database", "lance is cool"]})
    table = mem_db.create_table("test", data=corpus)

    def fts_texts(term):
        # "text" values of at most 10 FTS hits for `term`; kept as a list so
        # that the number of hits can still be asserted.
        hits = table.search(term, query_type="fts").limit(10).to_list()
        return [hit["text"] for hit in hits]

    table.create_fts_index("text", use_tantivy=False, base_tokenizer="ngram")

    texts = fts_texts("lan")
    assert len(texts) == 2
    assert set(texts) == lance_rows

    texts = fts_texts("nce")  # spellchecker:disable-line
    assert len(texts) == 2
    assert set(texts) == lance_rows

    # ngram_min_length defaults to 3, so the two-character term "la" should
    # not match anything
    assert len(fts_texts("la")) == 0

    # rebuild the index with a smaller ngram_min_length and prefix_only enabled
    table.create_fts_index(
        "text",
        use_tantivy=False,
        base_tokenizer="ngram",
        replace=True,
        ngram_min_length=2,
        prefix_only=True,
    )

    texts = fts_texts("lan")
    assert len(texts) == 2
    assert set(texts) == lance_rows

    # "nce" is not a prefix of any token, so it no longer matches
    assert len(fts_texts("nce")) == 0  # spellchecker:disable-line

    texts = fts_texts("la")
    assert len(texts) == 2
    assert set(texts) == lance_rows

View File

@@ -272,7 +272,9 @@ async def test_distance_range_with_new_rows_async():
# append more rows so that execution plan would be mixed with ANN & Flat KNN
new_data = pa.table(
{
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(np.random.rand(4, 2)),
"vector": pa.FixedShapeTensorArray.from_numpy_ndarray(
np.random.rand(4, 2) + 1
),
}
)
await table.add(new_data)

View File

@@ -499,3 +499,19 @@ def test_empty_result_reranker():
.rerank(reranker)
.to_arrow()
)
@pytest.mark.parametrize("use_tantivy", [True, False])
def test_cross_encoder_reranker_return_all(tmp_path, use_tantivy):
    """Hybrid search reranked by a cross encoder with ``return_score="all"``
    must expose the relevance score alongside the FTS score and the vector
    distance."""
    # Skipped when sentence_transformers is not installed.
    pytest.importorskip("sentence_transformers")

    table, _schema = get_test_table(tmp_path, use_tantivy)
    reranker = CrossEncoderReranker(return_score="all")

    search = table.search(
        "single player experience",
        query_type="hybrid",
        vector_column_name="vector",
    )
    result = search.rerank(reranker=reranker).to_arrow()

    for column in ("_relevance_score", "_score", "_distance"):
        assert column in result.column_names

View File

@@ -47,7 +47,10 @@ pub fn extract_index_params(source: &Option<Bound<'_, PyAny>>) -> PyResult<Lance
.max_token_length(params.max_token_length)
.remove_stop_words(params.remove_stop_words)
.stem(params.stem)
.ascii_folding(params.ascii_folding);
.ascii_folding(params.ascii_folding)
.ngram_min_length(params.ngram_min_length)
.ngram_max_length(params.ngram_max_length)
.ngram_prefix_only(params.prefix_only);
Ok(LanceDbIndex::FTS(inner_opts))
},
"IvfFlat" => {
@@ -130,6 +133,9 @@ struct FtsParams {
stem: bool,
remove_stop_words: bool,
ascii_folding: bool,
ngram_min_length: u32,
ngram_max_length: u32,
prefix_only: bool,
}
#[derive(FromPyObject)]

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.21.1-beta.1"
version = "0.21.1"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.21.1-beta.1"
version = "0.21.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -8,7 +8,7 @@ use std::path::Path;
use std::{collections::HashMap, sync::Arc};
use lance::dataset::{ReadParams, WriteMode};
use lance::io::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore};
use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
use lance_datafusion::utils::StreamingWriteSource;
use lance_encoding::version::LanceFileVersion;
use lance_table::io::commit::commit_handler_from_url;
@@ -217,6 +217,9 @@ pub struct ListingDatabase {
// Options for tables created by this connection
new_table_config: NewTableConfig,
// Session for object stores and caching
session: Arc<lance::session::Session>,
}
impl std::fmt::Display for ListingDatabase {
@@ -313,13 +316,17 @@ impl ListingDatabase {
let plain_uri = url.to_string();
let registry = Arc::new(ObjectStoreRegistry::default());
let session = Arc::new(lance::session::Session::default());
let os_params = ObjectStoreParams {
storage_options: Some(options.storage_options.clone()),
..Default::default()
};
let (object_store, base_path) =
ObjectStore::from_uri_and_params(registry, &plain_uri, &os_params).await?;
let (object_store, base_path) = ObjectStore::from_uri_and_params(
session.store_registry(),
&plain_uri,
&os_params,
)
.await?;
if object_store.is_local() {
Self::try_create_dir(&plain_uri).context(CreateDirSnafu { path: plain_uri })?;
}
@@ -342,6 +349,7 @@ impl ListingDatabase {
read_consistency_interval: request.read_consistency_interval,
storage_options: options.storage_options,
new_table_config: options.new_table_config,
session,
})
}
Err(_) => {
@@ -360,7 +368,13 @@ impl ListingDatabase {
read_consistency_interval: Option<std::time::Duration>,
new_table_config: NewTableConfig,
) -> Result<Self> {
let (object_store, base_path) = ObjectStore::from_uri(path).await?;
let session = Arc::new(lance::session::Session::default());
let (object_store, base_path) = ObjectStore::from_uri_and_params(
session.store_registry(),
path,
&ObjectStoreParams::default(),
)
.await?;
if object_store.is_local() {
Self::try_create_dir(path).context(CreateDirSnafu { path })?;
}
@@ -374,6 +388,7 @@ impl ListingDatabase {
read_consistency_interval,
storage_options: HashMap::new(),
new_table_config,
session,
})
}
@@ -441,6 +456,128 @@ impl ListingDatabase {
}
Ok(())
}
/// Inherit storage options from the connection into the target map
fn inherit_storage_options(&self, target: &mut HashMap<String, String>) {
for (key, value) in self.storage_options.iter() {
if !target.contains_key(key) {
target.insert(key.clone(), value.clone());
}
}
}
/// Extract storage option overrides from the request
fn extract_storage_overrides(
&self,
request: &CreateTableRequest,
) -> Result<(Option<LanceFileVersion>, Option<bool>)> {
let storage_options = request
.write_options
.lance_write_params
.as_ref()
.and_then(|p| p.store_params.as_ref())
.and_then(|sp| sp.storage_options.as_ref());
let storage_version_override = storage_options
.and_then(|opts| opts.get(OPT_NEW_TABLE_STORAGE_VERSION))
.map(|s| s.parse::<LanceFileVersion>())
.transpose()?;
let v2_manifest_override = storage_options
.and_then(|opts| opts.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS))
.map(|s| s.parse::<bool>())
.transpose()
.map_err(|_| Error::InvalidInput {
message: "enable_v2_manifest_paths must be a boolean".to_string(),
})?;
Ok((storage_version_override, v2_manifest_override))
}
/// Build the `WriteParams` used to create a table.
///
/// Starts from the write parameters supplied with `request` (or the
/// defaults when there are none) and then layers on:
///
/// * the connection's storage options, for keys the request did not set;
/// * the data storage version and the v2-manifest-paths flag, where the
///   connection's `new_table_config` takes precedence over the
///   storage-option overrides passed in, and values already present in the
///   request's write parameters are kept when neither source provides one;
/// * `WriteMode::Overwrite` when the request asks for overwrite;
/// * the connection's session.
fn prepare_write_params(
    &self,
    request: &CreateTableRequest,
    storage_version_override: Option<LanceFileVersion>,
    v2_manifest_override: Option<bool>,
) -> lance::dataset::WriteParams {
    let mut write_params = request
        .write_options
        .lance_write_params
        .clone()
        .unwrap_or_default();

    // Only modify the storage options if we actually have something to
    // inherit. There is a difference between storage_options=None and
    // storage_options=Some({}). Using storage_options=None will cause the
    // connection's session store registry to be used. Supplying Some({})
    // will cause a new connection to be created, and that connection will
    // be dropped from the cache when python GCs the table object, which
    // confounds reuse across tables.
    if !self.storage_options.is_empty() {
        let storage_options = write_params
            .store_params
            .get_or_insert_with(Default::default)
            .storage_options
            .get_or_insert_with(Default::default);
        self.inherit_storage_options(storage_options);
    }

    // Only assign when the connection config or a storage-option override
    // actually supplies a version. Assigning unconditionally would reset a
    // version the caller set directly on their write params back to `None`.
    if let Some(data_storage_version) = self
        .new_table_config
        .data_storage_version
        .or(storage_version_override)
    {
        write_params.data_storage_version = Some(data_storage_version);
    }

    if let Some(enable_v2_manifest_paths) = self
        .new_table_config
        .enable_v2_manifest_paths
        .or(v2_manifest_override)
    {
        write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
    }

    if matches!(&request.mode, CreateTableMode::Overwrite) {
        write_params.mode = WriteMode::Overwrite;
    }

    // Share the connection's session with the write.
    write_params.session = Some(self.session.clone());
    write_params
}
/// Handle the case where table already exists based on the create mode
async fn handle_table_exists(
&self,
table_name: &str,
mode: CreateTableMode,
data_schema: &arrow_schema::Schema,
) -> Result<Arc<dyn BaseTable>> {
match mode {
CreateTableMode::Create => Err(Error::TableAlreadyExists {
name: table_name.to_string(),
}),
CreateTableMode::ExistOk(callback) => {
let req = OpenTableRequest {
name: table_name.to_string(),
index_cache_size: None,
lance_read_params: None,
};
let req = (callback)(req);
let table = self.open_table(req).await?;
let table_schema = table.schema().await?;
if table_schema.as_ref() != data_schema {
return Err(Error::Schema {
message: "Provided schema does not match existing table schema".to_string(),
});
}
Ok(table)
}
CreateTableMode::Overwrite => unreachable!(),
}
}
}
#[async_trait::async_trait]
@@ -475,50 +612,14 @@ impl Database for ListingDatabase {
Ok(f)
}
async fn create_table(&self, mut request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>> {
let table_uri = self.table_uri(&request.name)?;
// Inherit storage options from the connection
let storage_options = request
.write_options
.lance_write_params
.get_or_insert_with(Default::default)
.store_params
.get_or_insert_with(Default::default)
.storage_options
.get_or_insert_with(Default::default);
for (key, value) in self.storage_options.iter() {
if !storage_options.contains_key(key) {
storage_options.insert(key.clone(), value.clone());
}
}
let storage_options = storage_options.clone();
let (storage_version_override, v2_manifest_override) =
self.extract_storage_overrides(&request)?;
let mut write_params = request.write_options.lance_write_params.unwrap_or_default();
if let Some(storage_version) = &self.new_table_config.data_storage_version {
write_params.data_storage_version = Some(*storage_version);
} else {
// Allow the user to override the storage version via storage options (backwards compatibility)
if let Some(data_storage_version) = storage_options.get(OPT_NEW_TABLE_STORAGE_VERSION) {
write_params.data_storage_version = Some(data_storage_version.parse()?);
}
}
if let Some(enable_v2_manifest_paths) = self.new_table_config.enable_v2_manifest_paths {
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
} else {
// Allow the user to override the storage version via storage options (backwards compatibility)
if let Some(enable_v2_manifest_paths) = storage_options
.get(OPT_NEW_TABLE_V2_MANIFEST_PATHS)
.map(|s| s.parse::<bool>().unwrap())
{
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
}
}
if matches!(&request.mode, CreateTableMode::Overwrite) {
write_params.mode = WriteMode::Overwrite;
}
let write_params =
self.prepare_write_params(&request, storage_version_override, v2_manifest_override);
let data_schema = request.data.arrow_schema();
@@ -533,30 +634,10 @@ impl Database for ListingDatabase {
.await
{
Ok(table) => Ok(Arc::new(table)),
Err(Error::TableAlreadyExists { name }) => match request.mode {
CreateTableMode::Create => Err(Error::TableAlreadyExists { name }),
CreateTableMode::ExistOk(callback) => {
let req = OpenTableRequest {
name: request.name.clone(),
index_cache_size: None,
lance_read_params: None,
};
let req = (callback)(req);
let table = self.open_table(req).await?;
let table_schema = table.schema().await?;
if table_schema != data_schema {
return Err(Error::Schema {
message: "Provided schema does not match existing table schema"
.to_string(),
});
}
Ok(table)
}
CreateTableMode::Overwrite => unreachable!(),
},
Err(Error::TableAlreadyExists { .. }) => {
self.handle_table_exists(&request.name, request.mode, &data_schema)
.await
}
Err(err) => Err(err),
}
}
@@ -564,18 +645,22 @@ impl Database for ListingDatabase {
async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
let table_uri = self.table_uri(&request.name)?;
// Inherit storage options from the connection
let storage_options = request
.lance_read_params
.get_or_insert_with(Default::default)
.store_options
.get_or_insert_with(Default::default)
.storage_options
.get_or_insert_with(Default::default);
for (key, value) in self.storage_options.iter() {
if !storage_options.contains_key(key) {
storage_options.insert(key.clone(), value.clone());
}
// Only modify the storage options if we actually have something to
// inherit. There is a difference between storage_options=None and
// storage_options=Some({}). Using storage_options=None will cause the
// connection's session store registry to be used. Supplying Some({})
// will cause a new connection to be created, and that connection will
// be dropped from the cache when python GCs the table object, which
// confounds reuse across tables.
if !self.storage_options.is_empty() {
let storage_options = request
.lance_read_params
.get_or_insert_with(Default::default)
.store_options
.get_or_insert_with(Default::default)
.storage_options
.get_or_insert_with(Default::default);
self.inherit_storage_options(storage_options);
}
// Some ReadParams are exposed in the OpenTableBuilder, but we also
@@ -584,13 +669,14 @@ impl Database for ListingDatabase {
// If we have a user provided ReadParams use that
// If we don't then start with the default ReadParams and customize it with
// the options from the OpenTableBuilder
let read_params = request.lance_read_params.unwrap_or_else(|| {
let mut read_params = request.lance_read_params.unwrap_or_else(|| {
let mut default_params = ReadParams::default();
if let Some(index_cache_size) = request.index_cache_size {
default_params.index_cache_size = index_cache_size as usize;
}
default_params
});
read_params.session(self.session.clone());
let native_table = Arc::new(
NativeTable::open_with_params(

View File

@@ -281,6 +281,46 @@ async fn test_encryption() -> Result<()> {
Ok(())
}
/// Storage options given for a single table must win over the options of
/// the connection, while tables created without their own options fall
/// back to the connection's settings.
#[tokio::test]
async fn test_table_storage_options_override() -> Result<()> {
    let bucket = S3Bucket::new("test-override").await;
    let connection_key = KMSKey::new().await;
    let table_key = KMSKey::new().await;

    // The connection encrypts with `connection_key`.
    let uri = format!("s3://{}", bucket.0);
    let db = lancedb::connect(&uri)
        .storage_options(CONFIG.iter().cloned())
        .storage_option("aws_server_side_encryption", "aws:kms")
        .storage_option("aws_sse_kms_key_id", &connection_key.0)
        .execute()
        .await?;

    // First table: override the KMS key at table level.
    let batch = test_data();
    let schema = batch.schema();
    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
    let _overridden = db
        .create_table("test_override", reader)
        .storage_option("aws_sse_kms_key_id", &table_key.0)
        .execute()
        .await?;
    // Its objects must be encrypted with the table-level key.
    validate_objects_encrypted(&bucket.0, "test_override", &table_key.0).await;

    // Second table: no override, so the connection's key applies.
    let batch = test_data();
    let schema = batch.schema();
    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema);
    let _inherited = db.create_table("test_inherit", reader).execute().await?;
    validate_objects_encrypted(&bucket.0, "test_inherit", &connection_key.0).await;

    Ok(())
}
struct DynamoDBCommitTable(String);
impl DynamoDBCommitTable {