Compare commits

..

8 Commits

Author SHA1 Message Date
dependabot[bot]
d145d594bb chore(deps): bump the rust-minor-patch group across 1 directory with 2 updates
Bumps the rust-minor-patch group with 2 updates in the / directory: [serde_json](https://github.com/serde-rs/json) and [aws-smithy-runtime](https://github.com/smithy-lang/smithy-rs).


Updates `serde_json` from 1.0.149 to 1.0.150
- [Release notes](https://github.com/serde-rs/json/releases)
- [Commits](https://github.com/serde-rs/json/compare/v1.0.149...v1.0.150)

Updates `aws-smithy-runtime` from 1.11.1 to 1.11.3
- [Release notes](https://github.com/smithy-lang/smithy-rs/releases)
- [Changelog](https://github.com/smithy-lang/smithy-rs/blob/main/CHANGELOG.md)
- [Commits](https://github.com/smithy-lang/smithy-rs/commits)

---
updated-dependencies:
- dependency-name: serde_json
  dependency-version: 1.0.150
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: rust-minor-patch
- dependency-name: aws-smithy-runtime
  dependency-version: 1.11.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: rust-minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-05-23 14:58:50 +00:00
Will Jones
ccec91d957 fix: use releases API in check_lance_release.py (#3427)
Previously `check_lance_release.py` used `git/refs/tags` with
`--paginate --jq`, which drops the last page in some `gh` versions. The
7.x Lance tags all landed on the final (partial) page, causing the
script to report `v6.0.1` as the latest and never triggering an update.

Switch to the releases API with `per_page=20`, which returns the 20 most
recent releases sorted newest-first — one API call, no pagination
needed.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-22 15:00:44 -07:00
Zhaocun Sun
ec82e36317 docs(python): document in-memory connections (#3434)
## Problem

Issue #2247 notes that the Python docs do not show how to use LanceDB's
in-memory backend via `connect("memory://")`.

## Solution

Add `memory://` examples to the sync and async `connect` docstrings, and
call out that in-memory databases are intended for tests/temporary data
and are not persisted.

## Validation

- `python3 -m py_compile python/python/lancedb/__init__.py`
- `git diff --check`

## Confidence

82/100 — docs-only update, directly tied to the documented missing
`memory://` usage. It changes API documentation only and was syntax/diff
validated.

Closes #2247.
2026-05-22 10:51:09 -07:00
Will Jones
da2a1c4a2c test(rust): fix flaky env-var-dependent client tests (#3426)
The `test_resolve_user_id_*` tests in `remote/client.rs` mutate the
process-global `LANCEDB_USER_ID` and `LANCEDB_USER_ID_ENV_KEY`
environment variables. cargo runs tests in a binary across multiple
threads, so one test's `remove_var` can race another's `set_var` between
when it's set and when `resolve_user_id()` reads it.

This surfaced as an intermittent failure of
`test_resolve_user_id_from_env_key` on Windows CI:

```
assertion `left == right` failed
  left: None
 right: Some("custom-env-user-id")
```

Annotates the five env-mutating tests with `serial_test`'s
`#[serial(user_id_env)]` so they run serially with respect to each
other.

Should be backported to `release/v0.28` (CI for #3421 hit this same
flake).

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 10:35:15 -07:00
Xuanwo
8463a10ebe docs: clarify PR title requirement for agents (#3433) 2026-05-22 20:09:20 +08:00
Lance Release
7168d64af1 Bump version: 0.30.0-beta.0 → 0.30.0-beta.1 2026-05-22 10:09:01 +00:00
Lance Release
403c33dff0 Bump version: 0.33.0-beta.0 → 0.33.0-beta.1 2026-05-22 10:08:07 +00:00
Xuanwo
a0001043b6 fix: canonicalize remote nested field paths (#3430)
Fixes #3407.

Remote tables now resolve create-index field paths against the table
schema before sending requests, so nested, escaped, and case-insensitive
inputs use the same canonical path contract as local tables. Remote
`list_indices()` also canonicalizes returned columns against the current
schema, and the remote query tests lock explicit nested vector and FTS
request payloads.
2026-05-22 15:23:00 +08:00
34 changed files with 546 additions and 181 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.30.0-beta.0"
current_version = "0.30.0-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -37,10 +37,13 @@ Before committing changes, run formatting for every language you touched. At min
and run targeted tests through `cd python && uv run ...`.
* TypeScript changes: run the relevant `npm`/`pnpm` lint, format, build, and docs commands in `nodejs`.
Before creating a PR, make sure the PR title follows Conventional Commits, such as
`fix: support nested field paths in native index creation` or
`feat(python): add dataset multiprocessing support`. The semantic-release check uses the
PR title and body as the merge commit message, so a non-conventional PR title will fail CI.
Before creating a PR, the exact value passed to `gh pr create --title` must follow
Conventional Commits, such as `fix: support nested field paths in native index creation`
or `feat(python): add dataset multiprocessing support`. Do not use a plain natural
language summary like `Support nested field paths in native index creation` as the PR
title. The semantic-release check uses the PR title and body as the merge commit message,
so a non-conventional PR title will fail CI. After creating a PR, read the remote PR title
back and fix it immediately if it is not conventional.
## Coding tips

152
Cargo.lock generated
View File

@@ -976,15 +976,16 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.11.1"
version = "1.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0504b1ab12debb5959e5165ee5fe97dd387e7aa7ea6a477bfd7635dfe769a4f5"
checksum = "b8e6f5caf6fea86f8c2206541ab5857cfcda9013426cdbe8fa0098b9e2d32182"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-http-client",
"aws-smithy-observability",
"aws-smithy-runtime-api",
"aws-smithy-schema",
"aws-smithy-types",
"bytes",
"fastrand",
@@ -1001,9 +1002,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
version = "1.12.0"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71a13df6ada0aafbf21a73bdfcdf9324cfa9df77d96b8446045be3cde61b42e"
checksum = "dc117c179ecf39a62a0a3f49f600e9ac26a7ad7dd172177999f83933af776c32"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api-macros",
@@ -1029,10 +1030,21 @@ dependencies = [
]
[[package]]
name = "aws-smithy-types"
version = "1.4.7"
name = "aws-smithy-schema"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c"
checksum = "7442cb268338f0eb8278140a107c046756aa01093d8ef5e99628d34ae09c94f5"
dependencies = [
"aws-smithy-runtime-api",
"aws-smithy-types",
"http 1.4.0",
]
[[package]]
name = "aws-smithy-types"
version = "1.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "056b66dbce2f81cc0c1e2b05bb402eb58f8a3530479d650efadd5bbae9a4050b"
dependencies = [
"base64-simd",
"bytes",
@@ -3284,8 +3296,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-array",
"rand 0.9.4",
@@ -4506,8 +4518,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
[[package]]
name = "lance"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arc-swap",
"arrow",
@@ -4525,7 +4537,6 @@ dependencies = [
"async_cell",
"aws-credential-types",
"aws-sdk-dynamodb",
"bitpacking",
"byteorder",
"bytes",
"chrono",
@@ -4555,7 +4566,6 @@ dependencies = [
"lance-table",
"lance-tokenizer",
"log",
"moka",
"object_store",
"permutation",
"pin-project",
@@ -4579,8 +4589,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4600,8 +4610,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrayref",
"paste",
@@ -4610,8 +4620,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4646,8 +4656,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"arrow-array",
@@ -4677,8 +4687,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"arrow-array",
@@ -4696,8 +4706,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4732,8 +4742,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4764,8 +4774,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arc-swap",
"arrow",
@@ -4829,8 +4839,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"arrow-arith",
@@ -4872,8 +4882,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4889,8 +4899,8 @@ dependencies = [
[[package]]
name = "lance-namespace"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"async-trait",
@@ -4902,8 +4912,8 @@ dependencies = [
[[package]]
name = "lance-namespace-impls"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"arrow-ipc",
@@ -4938,9 +4948,9 @@ dependencies = [
[[package]]
name = "lance-namespace-reqwest-client"
version = "0.7.7"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6369eee4682fb11edf538388b43c61ce288b8302fe89bb40944d7daa7faaae99"
checksum = "f65e31bdaa13e01dab6e7cf566da31df243c34a542f0d915d3601ec0e01e61d2"
dependencies = [
"reqwest 0.12.28",
"serde",
@@ -4952,8 +4962,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow",
"arrow-array",
@@ -4992,8 +5002,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -5004,8 +5014,8 @@ dependencies = [
[[package]]
name = "lance-tokenizer"
version = "7.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-rc.1#06f52e901dd9bd99d299263130fb2bda0a3f91af"
version = "7.0.0-beta.13"
source = "git+https://github.com/lance-format/lance.git?tag=v7.0.0-beta.13#929166e3ff51ed61b1fa42de2c63feaf51967ea1"
dependencies = [
"jieba-rs",
"lindera",
@@ -5016,7 +5026,7 @@ dependencies = [
[[package]]
name = "lancedb"
version = "0.30.0-beta.0"
version = "0.30.0-beta.1"
dependencies = [
"ahash",
"anyhow",
@@ -5086,6 +5096,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"serial_test",
"snafu 0.8.9",
"tempfile",
"test-log",
@@ -5098,7 +5109,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.30.0-beta.0"
version = "0.30.0-beta.1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -5121,7 +5132,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.33.0-beta.0"
version = "0.33.0-beta.1"
dependencies = [
"arrow",
"async-trait",
@@ -8130,6 +8141,15 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "scc"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc"
dependencies = [
"sdd",
]
[[package]]
name = "schannel"
version = "0.1.29"
@@ -8196,6 +8216,12 @@ dependencies = [
"untrusted 0.9.0",
]
[[package]]
name = "sdd"
version = "3.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca"
[[package]]
name = "sec1"
version = "0.3.0"
@@ -8287,9 +8313,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.149"
version = "1.0.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
dependencies = [
"itoa",
"memchr",
@@ -8386,6 +8412,32 @@ dependencies = [
"unsafe-libyaml",
]
[[package]]
name = "serial_test"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f"
dependencies = [
"futures-executor",
"futures-util",
"log",
"once_cell",
"parking_lot",
"scc",
"serial_test_derive",
]
[[package]]
name = "serial_test_derive"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "sha1"
version = "0.10.6"

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-rc.1", default-features = false, "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-rc.1", "tag" = "v7.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=7.0.0-beta.13", default-features = false, "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=7.0.0-beta.13", "tag" = "v7.0.0-beta.13", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }

View File

@@ -112,25 +112,25 @@ def fetch_remote_tags() -> List[TagInfo]:
"api",
"-X",
"GET",
f"repos/{LANCE_REPO}/git/refs/tags",
"--paginate",
f"repos/{LANCE_REPO}/releases",
"--jq",
".[].ref",
".[].tag_name",
"-F",
"per_page=20",
]
)
tags: List[TagInfo] = []
for line in output.splitlines():
ref = line.strip()
if not ref.startswith("refs/tags/v"):
tag = line.strip()
if not tag.startswith("v"):
continue
tag = ref.split("refs/tags/")[-1]
version = tag.lstrip("v")
try:
tags.append(TagInfo(tag=tag, version=version, semver=parse_semver(version)))
except ValueError:
continue
if not tags:
raise RuntimeError("No Lance tags could be parsed from GitHub API output")
raise RuntimeError("No Lance releases could be parsed from GitHub API output")
return tags

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.30.0-beta.0</version>
<version>0.30.0-beta.1</version>
</dependency>
```

View File

@@ -701,11 +701,15 @@ LSM-style write path for future `mergeInsert` calls.
`LsmWriteSpec` chooses one of three sharding strategies via `specType`:
- `"bucket"` — hash-bucket writes by a scalar `column` (`column` and
`numBuckets` required).
- `"bucket"` — hash-bucket writes by the single-column unenforced primary
key (`column` and `numBuckets` required).
- `"identity"` — shard by the raw value of a scalar `column`.
- `"unsharded"` — route every write to a single shard.
All variants require the table to have an unenforced primary key
([Table#setUnenforcedPrimaryKey](Table.md#setunenforcedprimarykey)); bucket sharding additionally
requires it to be the single column being bucketed.
#### Parameters
* **spec**: [`LsmWriteSpec`](../interfaces/LsmWriteSpec.md)
@@ -718,6 +722,7 @@ LSM-style write path for future `mergeInsert` calls.
#### Example
```ts
await table.setUnenforcedPrimaryKey("id");
await table.setLsmWriteSpec({
specType: "bucket",
column: "id",

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.30.0-beta.0</version>
<version>0.30.0-beta.1</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.30.0-beta.0</version>
<version>0.30.0-beta.1</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>7.0.0-rc.1</lance-core.version>
<lance-core.version>7.0.0-beta.13</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.30.0-beta.0"
version = "0.30.0-beta.1"
publish = false
license.workspace = true
description.workspace = true

View File

@@ -7,13 +7,6 @@ import * as tmp from "tmp";
import { Connection, Table, connect, connectNamespace } from "../lancedb";
import { LocalTable } from "../lancedb/table";
// The `_versions` directory may also contain `latest_version_hint.json`, which
// Lance writes on non-lexically-ordered stores (e.g. the local filesystem) to
// speed up latest-version lookup. Only assert on manifest files.
function manifestFiles(versionsDir: string): string[] {
return readdirSync(versionsDir).filter((f) => f.endsWith(".manifest"));
}
describe("when connecting", () => {
let tmpDir: tmp.DirResult;
beforeEach(() => {
@@ -178,7 +171,7 @@ describe("given a connection", () => {
let manifestDir =
tmpDir.name + "/test_manifest_paths_v2_empty.lance/_versions";
manifestFiles(manifestDir).forEach((file) => {
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
@@ -187,7 +180,7 @@ describe("given a connection", () => {
})) as LocalTable;
expect(await table.usesV2ManifestPaths()).toBe(true);
manifestDir = tmpDir.name + "/test_manifest_paths_v2.lance/_versions";
manifestFiles(manifestDir).forEach((file) => {
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});
@@ -206,14 +199,14 @@ describe("given a connection", () => {
const manifestDir =
tmpDir.name + "/test_manifest_path_migration.lance/_versions";
manifestFiles(manifestDir).forEach((file) => {
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d\.manifest$/);
});
await table.migrateManifestPathsV2();
expect(await table.usesV2ManifestPaths()).toBe(true);
manifestFiles(manifestDir).forEach((file) => {
readdirSync(manifestDir).forEach((file) => {
expect(file).toMatch(/^\d{20}\.manifest$/);
});
});

View File

@@ -537,14 +537,19 @@ export abstract class Table {
*
* `LsmWriteSpec` chooses one of three sharding strategies via `specType`:
*
* - `"bucket"` — hash-bucket writes by a scalar `column` (`column` and
* `numBuckets` required).
* - `"bucket"` — hash-bucket writes by the single-column unenforced primary
* key (`column` and `numBuckets` required).
* - `"identity"` — shard by the raw value of a scalar `column`.
* - `"unsharded"` — route every write to a single shard.
*
* All variants require the table to have an unenforced primary key
* ({@link Table#setUnenforcedPrimaryKey}); bucket sharding additionally
* requires it to be the single column being bucketed.
* @param {LsmWriteSpec} spec The sharding spec to install.
* @returns {Promise<void>}
* @example
* ```ts
* await table.setUnenforcedPrimaryKey("id");
* await table.setLsmWriteSpec({
* specType: "bucket",
* column: "id",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.30.0-beta.0",
"version": "0.30.0-beta.1",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.33.0-beta.0"
current_version = "0.33.0-beta.1"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.33.0-beta.0"
version = "0.33.0-beta.1"
publish = false
edition.workspace = true
description = "Python bindings for LanceDB"

View File

@@ -147,6 +147,13 @@ def connect(
>>> db = lancedb.connect("s3://my-bucket/lancedb",
... storage_options={"aws_access_key_id": "***"})
For tests and temporary data, use an in-memory database:
>>> db = lancedb.connect("memory://")
In-memory databases are not persisted. Tables are dropped when the last
connection or table handle referencing them is closed.
Connect to LanceDB cloud:
>>> db = lancedb.connect("db://my_database", api_key="ldb_...",
@@ -378,6 +385,8 @@ async def connect_async(
... db = await lancedb.connect_async("s3://my-bucket/lancedb",
... storage_options={
... "aws_access_key_id": "***"})
... # For tests and temporary data, use an in-memory database
... db = await lancedb.connect_async("memory://")
... # Connect to LanceDB cloud
... db = await lancedb.connect_async("db://my_database", api_key="ldb_...",
... client_config={

View File

@@ -3875,11 +3875,15 @@ class AsyncTable:
strategies:
- ``LsmWriteSpec.bucket(column, num_buckets)`` — hash-bucket writes by
a scalar column.
the single-column unenforced primary key.
- ``LsmWriteSpec.identity(column)`` — shard by the raw value of a
scalar column.
- ``LsmWriteSpec.unsharded()`` — route every write to a single shard.
All variants require the table to have an unenforced primary key set
via [`set_unenforced_primary_key`]; bucket sharding additionally
requires it to be the single column being bucketed.
Parameters
----------
spec : LsmWriteSpec
@@ -3888,6 +3892,7 @@ class AsyncTable:
Examples
--------
>>> from lancedb._lancedb import LsmWriteSpec
>>> # table.set_unenforced_primary_key("id")
>>> # table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 16))
"""
await self._inner.set_lsm_write_spec(spec)

View File

@@ -450,13 +450,6 @@ async def test_create_exist_ok_async(tmp_db_async: lancedb.AsyncConnection):
# await db.create_table("test", schema=bad_schema, exist_ok=True)
def _manifest_files(versions_dir):
# The `_versions` directory may also contain `latest_version_hint.json`,
# which Lance writes on non-lexically-ordered stores (e.g. the local
# filesystem) to speed up latest-version lookup. Only assert on manifests.
return [f for f in os.listdir(versions_dir) if f.endswith(".manifest")]
@pytest.mark.asyncio
async def test_create_table_v2_manifest_paths_async(tmp_path):
db_with_v2_paths = await lancedb.connect_async(
@@ -472,7 +465,7 @@ async def test_create_table_v2_manifest_paths_async(tmp_path):
)
assert await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_manifest_paths.lance" / "_versions"
for manifest in _manifest_files(manifests_dir):
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)
# Start a table in V1 mode then migrate
@@ -482,13 +475,13 @@ async def test_create_table_v2_manifest_paths_async(tmp_path):
)
assert not await tbl.uses_v2_manifest_paths()
manifests_dir = tmp_path / "test_v2_migration.lance" / "_versions"
for manifest in _manifest_files(manifests_dir):
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d\.manifest", manifest)
await tbl.migrate_manifest_paths_v2()
assert await tbl.uses_v2_manifest_paths()
for manifest in _manifest_files(manifests_dir):
for manifest in os.listdir(manifests_dir):
assert re.match(r"\d{20}\.manifest", manifest)

View File

@@ -40,6 +40,16 @@ def _make_table(tmp_path):
def test_set_lsm_write_spec_validates(tmp_path):
_db, table = _make_table(tmp_path)
# No PK set yet.
with pytest.raises(Exception, match="primary key"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.set_unenforced_primary_key("id")
# Column mismatch.
with pytest.raises(Exception, match="match"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("v", 4))
# Out-of-range num_buckets.
with pytest.raises(Exception, match="num_buckets"):
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 0))
@@ -60,6 +70,7 @@ def test_unset_lsm_write_spec(tmp_path):
table.unset_lsm_write_spec()
# Install a spec, then remove it; afterwards a fresh spec can be set.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
table.unset_lsm_write_spec()
# A second unset errors — there is no spec left to remove.
@@ -70,7 +81,9 @@ def test_unset_lsm_write_spec(tmp_path):
def test_set_unsharded_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Unsharded routes every write to a single shard, skipping per-row hashing.
# Lance MemWAL still requires a primary key on the dataset; Unsharded
# just skips per-row hashing.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.unsharded())
table.unset_lsm_write_spec()
@@ -107,6 +120,7 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):
pa.RecordBatchReader.from_batches(SCHEMA, [_batch(["seed"], [0])]),
)
await table.set_unenforced_primary_key("id")
await table.set_lsm_write_spec(LsmWriteSpec.bucket("id", 4))
await table.unset_lsm_write_spec()
# A second unset errors.
@@ -116,7 +130,9 @@ async def test_async_set_unset_lsm_write_spec(tmp_path):
def test_set_identity_spec(tmp_path):
_db, table = _make_table(tmp_path)
# Identity sharding shards by the raw value of the given column.
# Identity sharding still requires an unenforced primary key on the
# table; it shards by the raw value of the given column.
table.set_unenforced_primary_key("id")
table.set_lsm_write_spec(LsmWriteSpec.identity("v"))
table.unset_lsm_write_spec()

View File

@@ -362,6 +362,22 @@ def test_table_create_indices():
schema=dict(
fields=[
dict(name="id", type={"type": "int64"}, nullable=False),
dict(name="text", type={"type": "string"}, nullable=False),
dict(
name="vector",
type={
"type": "fixed_size_list",
"fields": [
dict(
name="item",
type={"type": "float"},
nullable=True,
)
],
"length": 2,
},
nullable=False,
),
]
),
)

View File

@@ -185,7 +185,7 @@ pub struct LsmWriteSpec {
#[pymethods]
impl LsmWriteSpec {
/// Hash-bucket sharding by a scalar column.
/// Hash-bucket sharding by the unenforced primary key column.
#[staticmethod]
pub fn bucket(column: String, num_buckets: u32) -> Self {
Self {

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.30.0-beta.0"
version = "0.30.0-beta.1"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true
@@ -104,6 +104,7 @@ datafusion.workspace = true
http-body = "1" # Matching reqwest
rstest = "0.23.0"
test-log = "0.2"
serial_test = "3"
[features]

View File

@@ -888,6 +888,7 @@ pub mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use serial_test::serial;
use std::time::Duration;
#[test]
@@ -1143,6 +1144,7 @@ mod tests {
}
#[test]
#[serial(user_id_env)]
fn test_resolve_user_id_none() {
let config = ClientConfig::default();
// Clear env vars that might be set from other tests
@@ -1155,6 +1157,7 @@ mod tests {
}
#[test]
#[serial(user_id_env)]
fn test_resolve_user_id_from_env() {
// SAFETY: This is only called in tests
unsafe {
@@ -1169,6 +1172,7 @@ mod tests {
}
#[test]
#[serial(user_id_env)]
fn test_resolve_user_id_from_env_key() {
// SAFETY: This is only called in tests
unsafe {
@@ -1189,6 +1193,7 @@ mod tests {
}
#[test]
#[serial(user_id_env)]
fn test_resolve_user_id_direct_takes_precedence() {
// SAFETY: This is only called in tests
unsafe {
@@ -1206,6 +1211,7 @@ mod tests {
}
#[test]
#[serial(user_id_env)]
fn test_resolve_user_id_empty_env_ignored() {
// SAFETY: This is only called in tests
unsafe {

View File

@@ -1528,8 +1528,10 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
});
}
};
let schema = self.schema().await?;
let (canonical_column, field) = resolve_arrow_field_path(&schema, &column)?;
let mut body = serde_json::json!({
"column": column
"column": canonical_column
});
// Add name parameter if provided (for backwards compatibility, only include if Some)
@@ -1564,8 +1566,6 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
Index::LabelList(p) => ("LABEL_LIST", Some(to_json(p)?)),
Index::FTS(p) => ("FTS", Some(to_json(p)?)),
Index::Auto => {
let schema = self.schema().await?;
let field = resolve_arrow_field_path(&schema, &column)?;
if supported_vector_data_type(field.data_type()) {
body[METRIC_TYPE_KEY] =
serde_json::Value::String(DistanceType::L2.to_string().to_lowercase());
@@ -1862,16 +1862,26 @@ impl<S: HttpSend> BaseTable for RemoteTable<S> {
status_code: None,
})?;
let schema = self.schema().await?;
// Make request to get stats for each index, so we get the index type.
// This is a bit inefficient, but it's the only way to get the index type.
let mut futures = Vec::with_capacity(body.indexes.len());
for index in body.indexes {
let columns = index
.columns
.iter()
.map(|column| {
resolve_arrow_field_path(&schema, column)
.map(|(canonical_column, _)| canonical_column)
})
.collect::<Result<Vec<_>>>()?;
let future = async move {
match self.index_stats(&index.index_name).await {
Ok(Some(stats)) => Ok(Some(IndexConfig {
name: index.index_name,
index_type: stats.index_type,
columns: index.columns,
columns,
})),
Ok(None) => Ok(None), // The index must have been deleted since we listed it.
Err(e) => Err(e),
@@ -2313,6 +2323,38 @@ mod tests {
.unwrap()
}
fn nested_index_schema() -> Schema {
let vector_type =
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 8);
Schema::new(vec![
Field::new(
"metadata",
DataType::Struct(vec![Field::new("user_id", DataType::Int32, false)].into()),
false,
),
Field::new(
"image",
DataType::Struct(vec![Field::new("embedding", vector_type, false)].into()),
false,
),
Field::new(
"payload",
DataType::Struct(vec![Field::new("text", DataType::Utf8, false)].into()),
false,
),
Field::new(
"meta-data",
DataType::Struct(vec![Field::new("user-id", DataType::Int32, false)].into()),
false,
),
Field::new(
"literal",
DataType::Struct(vec![Field::new("a.b", DataType::Int32, false)].into()),
false,
),
])
}
#[rstest]
#[case("", 0)]
#[case("{}", 0)]
@@ -3079,6 +3121,59 @@ mod tests {
.unwrap();
}
#[tokio::test]
async fn test_query_vector_nested_field_path() {
let expected_data = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)
.unwrap();
let expected_data_ref = expected_data.clone();
let table = Table::new_with_handler("my_table", move |request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/query/");
assert_eq!(
request.headers().get("Content-Type").unwrap(),
JSON_CONTENT_TYPE
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
let mut expected_body = serde_json::json!({
"vector_column": "image.embedding",
"prefilter": true,
"k": 10,
"nprobes": 20,
"minimum_nprobes": 20,
"maximum_nprobes": 20,
"lower_bound": Option::<f32>::None,
"upper_bound": Option::<f32>::None,
"ef": Option::<usize>::None,
"refine_factor": Option::<u32>::None,
"version": null,
});
expected_body["vector"] = vec![0.1f32, 0.2, 0.3].into();
assert_eq!(body, expected_body);
let response_body = write_ipc_file(&expected_data_ref);
http::Response::builder()
.status(200)
.header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
.body(response_body)
.unwrap()
});
let _ = table
.query()
.nearest_to(vec![0.1, 0.2, 0.3])
.unwrap()
.column("image.embedding")
.execute()
.await
.unwrap();
}
#[tokio::test]
async fn test_query_fts() {
let table = Table::new_with_handler("my_table", |request| {
@@ -3160,7 +3255,7 @@ mod tests {
"query": {
"match": {
"terms": "hello world",
"column": "a",
"column": "payload.text",
"boost": 1.0,
"fuzziness": 0,
"max_expansions": 50,
@@ -3194,7 +3289,7 @@ mod tests {
.query()
.full_text_search(FullTextSearchQuery::new_query(
MatchQuery::new("hello world".to_owned())
.with_column(Some("a".to_owned()))
.with_column(Some("payload.text".to_owned()))
.into(),
))
.with_row_id()
@@ -3465,32 +3560,152 @@ mod tests {
for (index_type, expected_body, index) in cases {
let table = Table::new_with_handler("my_table", move |request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/create_index/");
assert_eq!(
request.headers().get("Content-Type").unwrap(),
JSON_CONTENT_TYPE
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
let mut expected_body = expected_body.clone();
expected_body["column"] = "a".into();
expected_body[INDEX_TYPE_KEY] = index_type.into();
match request.url().path() {
"/v1/table/my_table/describe/" => {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
http::Response::builder()
.status(200)
.body(describe_response(&schema))
.unwrap()
}
"/v1/table/my_table/create_index/" => {
assert_eq!(
request.headers().get("Content-Type").unwrap(),
JSON_CONTENT_TYPE
);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
let mut expected_body = expected_body.clone();
expected_body["column"] = "a".into();
expected_body[INDEX_TYPE_KEY] = index_type.into();
assert_eq!(body, expected_body);
assert_eq!(body, expected_body);
http::Response::builder().status(200).body("{}").unwrap()
http::Response::builder()
.status(200)
.body("{}".to_string())
.unwrap()
}
path => panic!("Unexpected path: {}", path),
}
});
table.create_index(&["a"], index).execute().await.unwrap();
}
}
#[tokio::test]
async fn test_create_index_nested_field_paths() {
let schema = nested_index_schema();
let expected_requests = Arc::new(vec![
json!({
"column": "metadata.user_id",
"index_type": "BTREE",
}),
json!({
"column": "image.embedding",
"index_type": "IVF_PQ",
"metric_type": "l2",
}),
{
let mut body = serde_json::to_value(InvertedIndexParams::default()).unwrap();
body["column"] = "payload.text".into();
body["index_type"] = "FTS".into();
body
},
json!({
"column": "`meta-data`.`user-id`",
"index_type": "BTREE",
}),
json!({
"column": "literal.`a.b`",
"index_type": "BTREE",
}),
]);
let request_idx = Arc::new(AtomicUsize::new(0));
let table = Table::new_with_handler("my_table", {
let schema = schema.clone();
let expected_requests = expected_requests.clone();
let request_idx = request_idx.clone();
move |request| {
assert_eq!(request.method(), "POST");
match request.url().path() {
"/v1/table/my_table/describe/" => http::Response::builder()
.status(200)
.body(describe_response(&schema))
.unwrap(),
"/v1/table/my_table/create_index/" => {
assert_eq!(
request.headers().get("Content-Type").unwrap(),
JSON_CONTENT_TYPE
);
let idx = request_idx.fetch_add(1, Ordering::SeqCst);
let body = request.body().unwrap().as_bytes().unwrap();
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
assert_eq!(body, expected_requests[idx]);
http::Response::builder()
.status(200)
.body("{}".to_string())
.unwrap()
}
path => panic!("Unexpected path: {}", path),
}
}
});
table
.create_index(&["Metadata.USER_ID"], Index::BTree(Default::default()))
.execute()
.await
.unwrap();
table
.create_index(&["Image.Embedding"], Index::Auto)
.execute()
.await
.unwrap();
table
.create_index(&["Payload.Text"], Index::FTS(Default::default()))
.execute()
.await
.unwrap();
table
.create_index(&["`META-DATA`.`USER-ID`"], Index::BTree(Default::default()))
.execute()
.await
.unwrap();
table
.create_index(&["literal.`A.B`"], Index::BTree(Default::default()))
.execute()
.await
.unwrap();
assert_eq!(request_idx.load(Ordering::SeqCst), expected_requests.len());
}
#[tokio::test]
async fn test_list_indices() {
let table = Table::new_with_handler("my_table", |request| {
let schema = Schema::new(vec![
Field::new(
"vector",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 8),
false,
),
Field::new(
"metadata",
DataType::Struct(vec![Field::new("my.column", DataType::Utf8, true)].into()),
false,
),
]);
let table = Table::new_with_handler("my_table", move |request| {
assert_eq!(request.method(), "POST");
let response_body = match request.url().path() {
"/v1/table/my_table/describe/" => {
return http::Response::builder()
.status(200)
.body(describe_response(&schema))
.unwrap();
}
"/v1/table/my_table/index/list/" => {
serde_json::json!({
"indexes": [
@@ -4010,6 +4225,20 @@ mod tests {
assert_eq!(request.method(), "POST");
let response_body = match request.url().path() {
"/v1/table/my_table/describe/" => {
let schema = Schema::new(vec![
Field::new(
"vector",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
8,
),
false,
),
Field::new("my_column", DataType::Utf8, false),
]);
serde_json::from_str::<serde_json::Value>(&describe_response(&schema)).unwrap()
}
"/v1/table/my_table/index/list/" => {
serde_json::json!({
"indexes": [
@@ -4171,13 +4400,23 @@ mod tests {
assert_eq!(value["index_type"], "IVF_PQ");
}
http::Response::builder().status(200).body("").unwrap()
}
"/v1/table/dev$users/describe/" => {
// Needed for schema check in Auto index type
http::Response::builder()
.status(200)
.body(r#"{"version": 1, "schema": {"fields": [{"name": "embedding", "type": {"type": "list", "item": {"type": "float32"}}, "nullable": false}]}}"#)
.body("".to_string())
.unwrap()
}
"/v1/table/dev$users/describe/" => {
let schema = Schema::new(vec![Field::new(
"embedding",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
8,
),
false,
)]);
http::Response::builder()
.status(200)
.body(describe_response(&schema))
.unwrap()
}
_ => {

View File

@@ -282,15 +282,17 @@ pub use self::merge::MergeResult;
/// date) and [`LsmWriteSpec::with_writer_config_defaults`] (default
/// `ShardWriter` configuration recorded in the MemWAL index).
///
/// All variants require the table to have an unenforced primary key.
///
/// Install a spec with [`Table::set_lsm_write_spec`] and remove it with
/// [`Table::unset_lsm_write_spec`]. The actual `merge_insert` dispatch
/// onto the MemWAL writer is a follow-up.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LsmWriteSpec {
/// Hash-bucket sharding by a scalar column.
/// Hash-bucket sharding by the unenforced primary key column.
///
/// `column` must be a non-nested column with a supported scalar type.
/// `num_buckets` must be in `[1, 1024]`.
/// `column` must equal the table's currently-set single-column
/// unenforced primary key. `num_buckets` must be in `[1, 1024]`.
/// Iceberg-compatible Murmur3-x86-32 (seed 0) is used so each row's
/// `bucket(column, num_buckets)` value is stable across processes.
Bucket {
@@ -1296,15 +1298,21 @@ impl Table {
///
/// [`LsmWriteSpec`] chooses one of three sharding strategies:
///
/// - [`LsmWriteSpec::bucket`] — hash-bucket writes by a scalar column.
/// - [`LsmWriteSpec::bucket`] — hash-bucket writes by the single-column
/// unenforced primary key.
/// - [`LsmWriteSpec::identity`] — shard by the raw value of a scalar column.
/// - [`LsmWriteSpec::unsharded`] — route every write to a single shard.
///
/// All variants require the table to have an unenforced primary key
/// ([`Table::set_unenforced_primary_key`]); bucket sharding additionally
/// requires it to be the single column being bucketed.
///
/// # Example
///
/// ```
/// # use lancedb::table::{LsmWriteSpec, Table};
/// # async fn example(table: &Table) -> Result<(), Box<dyn std::error::Error>> {
/// table.set_unenforced_primary_key(["id"]).await?;
/// table
/// .set_lsm_write_spec(
/// LsmWriteSpec::bucket("id", 16).with_maintained_indexes(["id_idx"]),
@@ -4592,6 +4600,21 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Reject when no PK is set.
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await
.expect_err("should reject without PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Set PK, then a mismatched column on the spec must be rejected.
table.set_unenforced_primary_key(["id"]).await.unwrap();
let err = table
.set_lsm_write_spec(LsmWriteSpec::bucket("name", 4))
.await
.expect_err("should reject column != PK");
assert!(matches!(err, Error::Lance { .. }), "got {:?}", err);
// Reject num_buckets out of range.
for bad in [0u32, 1025] {
let err = table
@@ -4657,6 +4680,9 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
// Lance's MemWAL still requires *some* unenforced primary key on
// the dataset; Unsharded just skips the per-row hashing step.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::unsharded())
.await
@@ -4703,6 +4729,7 @@ mod tests {
.unwrap();
let table = conn.create_table("t", reader).execute().await.unwrap();
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(
LsmWriteSpec::identity("region")
@@ -4758,6 +4785,7 @@ mod tests {
table.unset_lsm_write_spec().await.unwrap_err();
// Install a spec, then unset it.
table.set_unenforced_primary_key(["id"]).await.unwrap();
table
.set_lsm_write_spec(LsmWriteSpec::bucket("id", 4))
.await

View File

@@ -6,7 +6,7 @@ pub(crate) mod background_cache;
use std::sync::Arc;
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use datafusion_common::{DataFusionError, Result as DataFusionResult};
use datafusion_execution::RecordBatchStream;
use futures::{FutureExt, Stream};
@@ -199,38 +199,32 @@ fn collect_vector_columns(
path.pop();
}
pub(crate) fn resolve_arrow_field_path(schema: &Schema, column: &str) -> Result<Field> {
let segments =
lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput {
message: format!("Invalid field path `{}`: {}", column, e),
pub(crate) fn resolve_arrow_field_path(schema: &Schema, column: &str) -> Result<(String, Field)> {
lance_core::datatypes::parse_field_path(column).map_err(|e| Error::InvalidInput {
message: format!("Invalid field path `{}`: {}", column, e),
})?;
let lance_schema =
lance_core::datatypes::Schema::try_from(schema).map_err(|e| Error::Schema {
message: format!("Invalid schema: {}", e),
})?;
let mut fields = schema.fields();
for (idx, segment) in segments.iter().enumerate() {
let field = find_field(fields, segment).ok_or_else(|| Error::Schema {
message: format!("Field path `{}` not found in schema", column),
let field_path = lance_schema
.resolve_case_insensitive(column)
.ok_or_else(|| Error::Schema {
message: format!(
"Field path `{}` not found in schema. Available field paths: {}",
column,
lance_schema.field_paths().join(", ")
),
})?;
if idx + 1 == segments.len() {
return Ok(field.clone());
}
fields = match field.data_type() {
DataType::Struct(fields) => fields,
_ => {
return Err(Error::Schema {
message: format!("Field path `{}` not found in schema", column),
});
}
};
}
unreachable!("parse_field_path returns at least one segment")
}
fn find_field<'a>(fields: &'a Fields, name: &str) -> Option<&'a Field> {
fields
let field = field_path.last().expect("field path should be non-empty");
let path_segments = field_path
.iter()
.find(|field| field.name() == name)
.map(|field| field.as_ref())
.map(|field| field.name.as_str())
.collect::<Vec<_>>();
let canonical_path = lance_core::datatypes::format_field_path(&path_segments);
Ok((canonical_path, Field::from(*field)))
}
pub fn supported_btree_data_type(dtype: &DataType) -> bool {