Compare commits

..

8 Commits

Author SHA1 Message Date
Drew Gallardo
ba1ef34481 feat(rust): add blob v2 schema declaration and write path (#3528)
First Rust PR for #3231. Lance already supports storing blob v2 data; this PR
adds the LanceDB side: the blob v2 schema declaration and the write path.

```rust
let schema = Schema::new(vec![
    Field::new("id", DataType::Int64, false),
    lancedb::blob("image", true),
]);

let table = db.create_table("photos", schema).execute().await?;

table.add(batch_with_large_binary_image_column).execute().await?;
```

Read/materialize support and Python support will come in follow-up PRs.

### Testing

- cargo test -p lancedb --test blob_integration
- cargo test -p lancedb -- blob:: datafusion::blob_coerce
- cargo test -p lancedb (591 passed)
- cargo clippy --features remote --tests

---------

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
2026-06-19 12:33:15 -07:00
Will Jones
85d870b397 fix: parse RFC 3339 created_at and improve IndexConfig repr (#3558)
The server now serializes an index's `created_at` as an RFC 3339 string
(e.g. `"2026-06-18T21:37:36.637Z"`), but the client deserializer only
accepted a unix timestamp in milliseconds. This caused `list_indices` to
fail with:

```
Failed to parse list_indices response: invalid type: string "2026-06-18T21:37:36.637Z", expected a unix timestamp in milliseconds
```

This PR replaces the fixed millisecond deserializer with a custom one
that accepts both an RFC 3339 string (current server) and a
unix-millisecond integer (legacy deployments), so the client works
against any server version.

It also improves the `IndexConfig` repr in the Python bindings.
Previously it printed only three fields (`Index(FTS, columns=["text"],
name="text_idx")`), hiding the metadata that `list_indices` returns. It
now renders every populated field, omitting any that are `None`. Each
value is valid Python — integer counts use `_` thousands separators and
`created_at` uses the `datetime` repr — so values round-trip. The real
repr is a single line; it's wrapped here for readability:

```python
>>> table.list_indices()
[IndexConfig(
    name="text_idx",
    index_type="FTS",
    columns=["text"],
    index_uuid="aefd3e00-2f95-4bdc-92ac-06de84442bf1",
    type_url="/lance.table.InvertedIndexDetails",
    created_at=datetime.datetime(2026, 6, 18, 21, 37, 36, 637000, tzinfo=datetime.timezone.utc),
    num_indexed_rows=2,
    size_bytes=3_669,
    num_segments=1,
    index_version=1,
    index_details={
        'lance_tokenizer': None,
        'base_tokenizer': 'simple',
        'language': 'English',
        'with_position': False,
        'max_token_length': 40,
        'lower_case': True,
        'stem': True,
        'remove_stop_words': True,
        'custom_stop_words': None,
        'ascii_folding': True,
        'min_ngram_length': 3,
        'max_ngram_length': 3,
        'prefix_only': False,
    },
)]
```

Fixes #3556

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 10:40:56 -07:00
LanceDB Robot
c46d59d2ee chore: update lance dependency to v8.0.0-rc.1 (#3557)
Updates LanceDB Lance dependencies to Lance v8.0.0-rc.1.

This includes the Rust workspace Lance crates, Cargo.lock, and Java
lance-core version. Triggering tag:
https://github.com/lance-format/lance/releases/tag/v8.0.0-rc.1
2026-06-19 11:40:38 -05:00
Lance Release
113f187c2d Bump version: 0.31.0-beta.0 → 0.31.0-beta.1 2026-06-19 16:00:59 +00:00
Lance Release
3b279f5705 Bump version: 0.34.0-beta.0 → 0.34.0-beta.1 2026-06-19 15:59:43 +00:00
Ryan Green
e1334954d7 fix: overflow using sys.maxsize for k in query with namespace connection (#3561) 2026-06-19 12:57:10 -02:30
LanceDB Robot
2f65a233fe chore: update lance dependency to v8.0.0-beta.19 (#3555)
Updates LanceDB's Lance dependencies from v8.0.0-beta.17 to
v8.0.0-beta.19.

This includes the Rust workspace Lance crates, Cargo.lock refresh, and
Java lance-core version bump. Triggering Lance tag:
https://github.com/lance-format/lance/releases/tag/v8.0.0-beta.19
2026-06-18 14:16:57 -05:00
Lance Release
e81356089a Bump version: 0.30.1-beta.2 → 0.31.0-beta.0 2026-06-18 18:43:22 +00:00
33 changed files with 1301 additions and 100 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion] [tool.bumpversion]
current_version = "0.30.1-beta.2" current_version = "0.31.0-beta.1"
parse = """(?x) parse = """(?x)
(?P<major>0|[1-9]\\d*)\\. (?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\. (?P<minor>0|[1-9]\\d*)\\.

96
Cargo.lock generated
View File

@@ -3432,8 +3432,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]] [[package]]
name = "fsst" name = "fsst"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"rand 0.9.4", "rand 0.9.4",
@@ -4735,8 +4735,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
[[package]] [[package]]
name = "lance" name = "lance"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"arrow", "arrow",
@@ -4810,8 +4810,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-arrow" name = "lance-arrow"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -4832,7 +4832,7 @@ dependencies = [
[[package]] [[package]]
name = "lance-arrow-scalar" name = "lance-arrow-scalar"
version = "58.0.0" version = "58.0.0"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -4846,7 +4846,7 @@ dependencies = [
[[package]] [[package]]
name = "lance-arrow-stats" name = "lance-arrow-stats"
version = "58.0.0" version = "58.0.0"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-schema", "arrow-schema",
@@ -4855,8 +4855,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-bitpacking" name = "lance-bitpacking"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrayref", "arrayref",
"paste", "paste",
@@ -4865,8 +4865,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-core" name = "lance-core"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -4904,8 +4904,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-datafusion" name = "lance-datafusion"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"arrow-array", "arrow-array",
@@ -4935,8 +4935,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-datagen" name = "lance-datagen"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"arrow-array", "arrow-array",
@@ -4953,8 +4953,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-derive" name = "lance-derive"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@@ -4963,8 +4963,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-encoding" name = "lance-encoding"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-arith", "arrow-arith",
"arrow-array", "arrow-array",
@@ -4999,8 +4999,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-file" name = "lance-file"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-arith", "arrow-arith",
"arrow-array", "arrow-array",
@@ -5030,8 +5030,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-index" name = "lance-index"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arc-swap", "arc-swap",
"arrow", "arrow",
@@ -5096,8 +5096,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-io" name = "lance-io"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"arrow-arith", "arrow-arith",
@@ -5138,8 +5138,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-linalg" name = "lance-linalg"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -5154,8 +5154,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-namespace" name = "lance-namespace"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"async-trait", "async-trait",
@@ -5167,8 +5167,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-namespace-impls" name = "lance-namespace-impls"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"arrow-ipc", "arrow-ipc",
@@ -5222,8 +5222,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-select" name = "lance-select"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -5238,8 +5238,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-table" name = "lance-table"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow", "arrow",
"arrow-array", "arrow-array",
@@ -5278,8 +5278,8 @@ dependencies = [
[[package]] [[package]]
name = "lance-testing" name = "lance-testing"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-schema", "arrow-schema",
@@ -5292,20 +5292,21 @@ dependencies = [
[[package]] [[package]]
name = "lance-tokenizer" name = "lance-tokenizer"
version = "8.0.0-beta.17" version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.17#0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c" source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
dependencies = [ dependencies = [
"icu_segmenter", "icu_segmenter",
"jieba-rs", "jieba-rs",
"lindera", "lindera",
"rust-stemmers", "rust-stemmers",
"serde", "serde",
"stop-words",
"unicode-normalization", "unicode-normalization",
] ]
[[package]] [[package]]
name = "lancedb" name = "lancedb"
version = "0.30.1-beta.2" version = "0.31.0-beta.1"
dependencies = [ dependencies = [
"ahash", "ahash",
"anyhow", "anyhow",
@@ -5388,7 +5389,7 @@ dependencies = [
[[package]] [[package]]
name = "lancedb-nodejs" name = "lancedb-nodejs"
version = "0.30.1-beta.2" version = "0.31.0-beta.1"
dependencies = [ dependencies = [
"arrow-array", "arrow-array",
"arrow-buffer", "arrow-buffer",
@@ -5413,7 +5414,7 @@ dependencies = [
[[package]] [[package]]
name = "lancedb-python" name = "lancedb-python"
version = "0.33.1-beta.2" version = "0.34.0-beta.1"
dependencies = [ dependencies = [
"arrow", "arrow",
"async-trait", "async-trait",
@@ -9205,6 +9206,15 @@ version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978"
[[package]]
name = "stop-words"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d"
dependencies = [
"serde_json",
]
[[package]] [[package]]
name = "str_stack" name = "str_stack"
version = "0.1.1" version = "0.1.1"

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0" rust-version = "1.91.0"
[workspace.dependencies] [workspace.dependencies]
lance = { "version" = "=8.0.0-beta.17", default-features = false, "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-core = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-datagen = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-file = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=8.0.0-beta.17", default-features = false, "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-io = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-index = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-linalg = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-namespace = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=8.0.0-beta.17", default-features = false, "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-namespace-impls = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-table = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-testing = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-datafusion = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-encoding = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=8.0.0-beta.17", "tag" = "v8.0.0-beta.17", "git" = "https://github.com/lance-format/lance.git" } lance-arrow = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8" ahash = "0.8"
# Note that this one does not include pyarrow # Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false } arrow = { version = "58.0.0", optional = false }

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency> <dependency>
<groupId>com.lancedb</groupId> <groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId> <artifactId>lancedb-core</artifactId>
<version>0.30.1-beta.2</version> <version>0.31.0-beta.1</version>
</dependency> </dependency>
``` ```

View File

@@ -8,7 +8,7 @@
<parent> <parent>
<groupId>com.lancedb</groupId> <groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId> <artifactId>lancedb-parent</artifactId>
<version>0.30.1-beta.2</version> <version>0.31.0-beta.1</version>
<relativePath>../pom.xml</relativePath> <relativePath>../pom.xml</relativePath>
</parent> </parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId> <groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId> <artifactId>lancedb-parent</artifactId>
<version>0.30.1-beta.2</version> <version>0.31.0-beta.1</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>${project.artifactId}</name> <name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description> <description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version> <arrow.version>15.0.0</arrow.version>
<lance-core.version>8.0.0-beta.17</lance-core.version> <lance-core.version>8.0.0-rc.1</lance-core.version>
<spotless.skip>false</spotless.skip> <spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version> <spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version> <spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,7 +1,7 @@
[package] [package]
name = "lancedb-nodejs" name = "lancedb-nodejs"
edition.workspace = true edition.workspace = true
version = "0.30.1-beta.2" version = "0.31.0-beta.1"
publish = false publish = false
license.workspace = true license.workspace = true
description.workspace = true description.workspace = true

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-darwin-arm64", "name": "@lancedb/lancedb-darwin-arm64",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["darwin"], "os": ["darwin"],
"cpu": ["arm64"], "cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node", "main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-arm64-gnu", "name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["linux"], "os": ["linux"],
"cpu": ["arm64"], "cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node", "main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-arm64-musl", "name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["linux"], "os": ["linux"],
"cpu": ["arm64"], "cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node", "main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-x64-gnu", "name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["linux"], "os": ["linux"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node", "main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-linux-x64-musl", "name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["linux"], "os": ["linux"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node", "main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-win32-arm64-msvc", "name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": [ "os": [
"win32" "win32"
], ],

View File

@@ -1,6 +1,6 @@
{ {
"name": "@lancedb/lancedb-win32-x64-msvc", "name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"os": ["win32"], "os": ["win32"],
"cpu": ["x64"], "cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node", "main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{ {
"name": "@lancedb/lancedb", "name": "@lancedb/lancedb",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "@lancedb/lancedb", "name": "@lancedb/lancedb",
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"cpu": [ "cpu": [
"x64", "x64",
"arm64" "arm64"

View File

@@ -11,7 +11,7 @@
"ann" "ann"
], ],
"private": false, "private": false,
"version": "0.30.1-beta.2", "version": "0.31.0-beta.1",
"main": "dist/index.js", "main": "dist/index.js",
"exports": { "exports": {
".": "./dist/index.js", ".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion] [tool.bumpversion]
current_version = "0.34.0-beta.0" current_version = "0.34.0-beta.1"
parse = """(?x) parse = """(?x)
(?P<major>0|[1-9]\\d*)\\. (?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\. (?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "lancedb-python" name = "lancedb-python"
version = "0.34.0-beta.0" version = "0.34.0-beta.1"
publish = false publish = false
edition.workspace = true edition.workspace = true
description = "Python bindings for LanceDB" description = "Python bindings for LanceDB"

View File

@@ -71,6 +71,9 @@ from lancedb.embeddings import EmbeddingFunctionConfig
from ._lancedb import Session from ._lancedb import Session
_MAX_QUERY_K = 2**31 - 1
def _query_to_namespace_request( def _query_to_namespace_request(
table_id: List[str], table_id: List[str],
query: "Query", query: "Query",
@@ -148,7 +151,8 @@ def _query_to_namespace_request(
if query.limit is not None: if query.limit is not None:
k = query.limit k = query.limit
elif query.vector is None and query.full_text_query is None: elif query.vector is None and query.full_text_query is None:
k = sys.maxsize # limit k to max i32 value to avoid client overflows
k = _MAX_QUERY_K
else: else:
k = 10 k = 10

View File

@@ -91,7 +91,9 @@ async def test_create_scalar_index(some_table: AsyncTable):
# Can recreate if replace=True # Can recreate if replace=True
await some_table.create_index("id", replace=True) await some_table.create_index("id", replace=True)
indices = await some_table.list_indices() indices = await some_table.list_indices()
assert str(indices) == '[Index(BTree, columns=["id"], name="id_idx")]' assert str(indices).startswith(
'[IndexConfig(name="id_idx", index_type="BTree", columns=["id"]'
)
assert len(indices) == 1 assert len(indices) == 1
assert indices[0].index_type == "BTree" assert indices[0].index_type == "BTree"
assert indices[0].columns == ["id"] assert indices[0].columns == ["id"]
@@ -106,6 +108,27 @@ async def test_create_scalar_index(some_table: AsyncTable):
assert len(indices) == 0 assert len(indices) == 0
@pytest.mark.asyncio
async def test_index_config_repr(db_async):
# Use >= 1000 rows so the thousands separator in the repr is exercised.
nrows = 1500
table = await db_async.create_table(
"repr_table", pa.Table.from_pydict({"id": list(range(nrows))})
)
await table.create_index("id", config=BTree())
indices = await table.list_indices()
assert len(indices) == 1
r = repr(indices[0])
assert r.startswith('IndexConfig(name="id_idx", index_type="BTree", columns=["id"]')
# Integer counts use `_` thousands separators (valid Python int syntax).
assert "num_indexed_rows=1_500" in r
assert "num_unindexed_rows=0" in r
# created_at renders as a datetime so the value round-trips.
assert "created_at=datetime.datetime(" in r
assert r.endswith(")")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_create_nested_scalar_index_lists_canonical_paths(db_async): async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
metadata_type = pa.struct( metadata_type = pa.struct(
@@ -198,7 +221,9 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
async def test_create_fixed_size_binary_index(some_table: AsyncTable): async def test_create_fixed_size_binary_index(some_table: AsyncTable):
await some_table.create_index("fsb", config=BTree()) await some_table.create_index("fsb", config=BTree())
indices = await some_table.list_indices() indices = await some_table.list_indices()
assert str(indices) == '[Index(BTree, columns=["fsb"], name="fsb_idx")]' assert str(indices).startswith(
'[IndexConfig(name="fsb_idx", index_type="BTree", columns=["fsb"]'
)
assert len(indices) == 1 assert len(indices) == 1
assert indices[0].index_type == "BTree" assert indices[0].index_type == "BTree"
assert indices[0].columns == ["fsb"] assert indices[0].columns == ["fsb"]
@@ -247,7 +272,9 @@ async def test_create_bitmap_index(some_table: AsyncTable):
async def test_create_label_list_index(some_table: AsyncTable): async def test_create_label_list_index(some_table: AsyncTable):
await some_table.create_index("tags", config=LabelList()) await some_table.create_index("tags", config=LabelList())
indices = await some_table.list_indices() indices = await some_table.list_indices()
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]' assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
)
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan() plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
assert "ScalarIndexQuery" in plan assert "ScalarIndexQuery" in plan
@@ -262,7 +289,9 @@ async def test_create_large_list_label_list_index(db_async):
await table.create_index("tags", config=LabelList()) await table.create_index("tags", config=LabelList())
indices = await table.list_indices() indices = await table.list_indices()
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]' assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
)
plan = await table.query().where("array_has(tags, 'shared')").explain_plan() plan = await table.query().where("array_has(tags, 'shared')").explain_plan()
assert "ScalarIndexQuery" in plan assert "ScalarIndexQuery" in plan
@@ -299,7 +328,9 @@ async def test_create_label_list_index_rejects_list_struct(db_async):
async def test_full_text_search_index(some_table: AsyncTable): async def test_full_text_search_index(some_table: AsyncTable):
await some_table.create_index("tags", config=FTS(with_position=False)) await some_table.create_index("tags", config=FTS(with_position=False))
indices = await some_table.list_indices() indices = await some_table.list_indices()
assert str(indices) == '[Index(FTS, columns=["tags"], name="tags_idx")]' assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="FTS", columns=["tags"]'
)
await some_table.prewarm_index("tags_idx") await some_table.prewarm_index("tags_idx")

View File

@@ -5,11 +5,11 @@
import tempfile import tempfile
import shutil import shutil
import sys
import pytest import pytest
import pyarrow as pa import pyarrow as pa
import lancedb import lancedb
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
from lancedb.namespace import _MAX_QUERY_K
from lancedb.table import AsyncTable, LanceTable from lancedb.table import AsyncTable, LanceTable
@@ -816,10 +816,13 @@ class TestPushdownOperations:
["geneva", "hist"], ["geneva", "hist"],
["geneva", "hist"], ["geneva", "hist"],
] ]
# Unlimited reads cap k at i32::MAX (the namespace query_table `k`
# field is i32); sys.maxsize would overflow the Rust binding.
assert [request.k for request in namespace_client.requests] == [ assert [request.k for request in namespace_client.requests] == [
sys.maxsize, _MAX_QUERY_K,
sys.maxsize, _MAX_QUERY_K,
] ]
assert all(r.k <= 2**31 - 1 for r in namespace_client.requests)
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -874,10 +877,13 @@ class TestAsyncPushdownOperations:
["geneva", "hist"], ["geneva", "hist"],
["geneva", "hist"], ["geneva", "hist"],
] ]
# Unlimited reads cap k at i32::MAX (the namespace query_table `k`
# field is i32); sys.maxsize would overflow the Rust binding.
assert [request.k for request in namespace_client.requests] == [ assert [request.k for request in namespace_client.requests] == [
sys.maxsize, _MAX_QUERY_K,
sys.maxsize, _MAX_QUERY_K,
] ]
assert all(r.k <= 2**31 - 1 for r in namespace_client.requests)
def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path): def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path):

View File

@@ -319,11 +319,53 @@ pub struct IndexConfig {
#[pymethods] #[pymethods]
impl IndexConfig { impl IndexConfig {
pub fn __repr__(&self) -> String { pub fn __repr__(&self, py: Python<'_>) -> String {
format!( let mut fields = vec![
"Index({}, columns={:?}, name=\"{}\")", format!("name={:?}", self.name),
self.index_type, self.columns, self.name format!("index_type={:?}", self.index_type),
) format!("columns={:?}", self.columns),
];
if let Some(v) = &self.index_uuid {
fields.push(format!("index_uuid={:?}", v));
}
if let Some(v) = &self.type_url {
fields.push(format!("type_url={:?}", v));
}
if let Some(v) = self.created_at {
// Render the datetime's own Python repr so the value round-trips,
// falling back to RFC 3339 if the conversion ever fails.
let rendered = v
.into_pyobject(py)
.ok()
.and_then(|obj| obj.into_any().repr().ok())
.map(|r| r.to_string())
.unwrap_or_else(|| v.to_rfc3339());
fields.push(format!("created_at={}", rendered));
}
if let Some(v) = self.num_indexed_rows {
fields.push(format!("num_indexed_rows={}", fmt_thousands(v)));
}
if let Some(v) = self.num_unindexed_rows {
fields.push(format!("num_unindexed_rows={}", fmt_thousands(v)));
}
if let Some(v) = self.size_bytes {
fields.push(format!("size_bytes={}", fmt_thousands(v)));
}
if let Some(v) = self.num_segments {
fields.push(format!("num_segments={}", v));
}
if let Some(v) = self.index_version {
fields.push(format!("index_version={}", v));
}
if let Some(v) = &self.index_details {
let details = v
.bind(py)
.repr()
.map(|r| r.to_string())
.unwrap_or_else(|_| "<unavailable>".to_string());
fields.push(format!("index_details={}", details));
}
format!("IndexConfig({})", fields.join(", "))
} }
// For backwards-compatibility with the old sync SDK, we also support getting // For backwards-compatibility with the old sync SDK, we also support getting
@@ -352,6 +394,23 @@ impl IndexConfig {
} }
} }
/// Render `n` in decimal with `_` between each group of three digits,
/// e.g. `24_500_213`.
///
/// Python accepts underscores inside integer literals, so the output can be
/// pasted back into Python source and still parses as the same number.
fn fmt_thousands(n: u64) -> String {
    let digits = n.to_string();
    // Group from the least-significant end so that only the leading group can
    // be shorter than three digits, then restore left-to-right order.
    let groups: Vec<&str> = digits
        .as_bytes()
        .rchunks(3)
        .rev()
        .map(|group| std::str::from_utf8(group).expect("decimal digits are ASCII"))
        .collect();
    groups.join("_")
}
fn parse_index_details(py: Python<'_>, s: String) -> Py<PyAny> { fn parse_index_details(py: Python<'_>, s: String) -> Py<PyAny> {
let json = py.import("json").expect("json module is always available"); let json = py.import("json").expect("json module is always available");
match json.call_method1("loads", (s.as_str(),)) { match json.call_method1("loads", (s.as_str(),)) {

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "lancedb" name = "lancedb"
version = "0.30.1-beta.2" version = "0.31.0-beta.1"
edition.workspace = true edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications" description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true license.workspace = true

126
rust/lancedb/src/blob.rs Normal file
View File

@@ -0,0 +1,126 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Lance blob v2 columns store large binary payloads out of line.
//!
//! Declare a column with [`blob`]. On write, [`crate::table::Table::add`] coerces
//! raw `Binary` / `LargeBinary` into the blob struct layout. Queries return
//! small descriptors, not bytes.
//!
//! Blob tables require Lance file format >= 2.2 and stable row ids at create.
use arrow_schema::{Field, Schema};
use lance::dataset::WriteParams;
use lance_arrow::FieldExt;
use lance_encoding::version::LanceFileVersion;
/// Builds the Arrow [`Field`] that declares a Lance blob v2 column.
///
/// The field is a `Struct<data, uri>` tagged with the `lance.blob.v2`
/// marker, which is the layout Lance expects on write.
///
/// ```
/// use arrow_schema::{DataType, Field, Schema};
///
/// let schema = Schema::new(vec![
///     Field::new("id", DataType::Int64, false),
///     lancedb::blob("image", true),
/// ]);
/// ```
///
/// Tables with blob columns use Lance file format >= 2.2 and stable row ids
/// at create.
pub fn blob(name: impl AsRef<str>, nullable: bool) -> Field {
    let column_name: &str = name.as_ref();
    lance::blob::blob_field(column_name, nullable)
}
/// Reports whether at least one top-level field of `schema` is a blob v2 column.
pub(crate) fn has_blob_columns(schema: &Schema) -> bool {
    let mut columns = schema.fields().iter();
    columns.any(|column| column.is_blob_v2())
}
/// Raises `params.data_storage_version` to [`LanceFileVersion::V2_2`] when
/// `schema` has a blob column and the requested version resolves below 2.2.
///
/// An unset version is treated as [`LanceFileVersion::Stable`] for the
/// comparison. Schemas without blob columns, and versions that already
/// resolve to 2.2 or later, leave `params` untouched.
pub(crate) fn ensure_blob_storage_version(schema: &Schema, params: &mut WriteParams) {
    if has_blob_columns(schema) {
        let requested = params
            .data_storage_version
            .unwrap_or(LanceFileVersion::Stable);
        if requested.resolve() < LanceFileVersion::V2_2 {
            params.data_storage_version = Some(LanceFileVersion::V2_2);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::DataType;
    use lance_arrow::ARROW_EXT_NAME_KEY;

    /// Plain `id` column shared by the schemas below.
    fn id_field() -> Field {
        Field::new("id", DataType::Int64, false)
    }

    /// Schema with an `id` column and one nullable blob column named `image`.
    fn blob_schema() -> Schema {
        Schema::new(vec![id_field(), blob("image", true)])
    }

    /// Write params pinned to an explicit storage version.
    fn params_at(version: LanceFileVersion) -> WriteParams {
        WriteParams {
            data_storage_version: Some(version),
            ..Default::default()
        }
    }

    #[test]
    fn blob_field_carries_v2_extension_marker() {
        let field = blob("image", true);
        let marker = field.metadata().get(ARROW_EXT_NAME_KEY).map(String::as_str);
        assert_eq!(marker, Some("lance.blob.v2"));
        assert!(matches!(field.data_type(), DataType::Struct(_)));
    }

    #[test]
    fn has_blob_columns_detects_blob_fields() {
        let with_blob = blob_schema();
        let without_blob = Schema::new(vec![id_field()]);
        assert!(has_blob_columns(&with_blob));
        assert!(!has_blob_columns(&without_blob));
    }

    #[test]
    fn storage_version_bumps_to_v2_2() {
        let mut params = WriteParams::default();
        ensure_blob_storage_version(&blob_schema(), &mut params);
        let resolved = params.data_storage_version.unwrap().resolve();
        assert_eq!(resolved, LanceFileVersion::V2_2);
    }

    #[test]
    fn storage_version_overrides_lower_explicit_version() {
        let mut params = params_at(LanceFileVersion::V2_0);
        ensure_blob_storage_version(&blob_schema(), &mut params);
        let resolved = params.data_storage_version.unwrap().resolve();
        assert_eq!(resolved, LanceFileVersion::V2_2);
    }

    #[test]
    fn storage_version_keeps_higher_explicit_version() {
        let mut params = params_at(LanceFileVersion::V2_3);
        ensure_blob_storage_version(&blob_schema(), &mut params);
        // Compared unresolved: the explicit choice must be left exactly as given.
        assert_eq!(params.data_storage_version.unwrap(), LanceFileVersion::V2_3);
    }

    #[test]
    fn storage_version_noop_without_blob_columns() {
        let plain = Schema::new(vec![id_field()]);
        let mut params = WriteParams::default();
        ensure_blob_storage_version(&plain, &mut params);
        assert!(params.data_storage_version.is_none());
    }
}

View File

@@ -18,6 +18,7 @@ use lance_table::io::commit::commit_handler_from_url;
use object_store::local::LocalFileSystem; use object_store::local::LocalFileSystem;
use snafu::ResultExt; use snafu::ResultExt;
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
use crate::connection::ConnectRequest; use crate::connection::ConnectRequest;
use crate::database::ReadConsistency; use crate::database::ReadConsistency;
use crate::database::namespace::LanceNamespaceDatabase; use crate::database::namespace::LanceNamespaceDatabase;
@@ -838,13 +839,16 @@ impl ListingDatabase {
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths; write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
} }
// Apply enable_stable_row_ids: table-level override takes precedence over connection config let data_schema = request.data.arrow_schema();
if let Some(enable_stable_row_ids) = if let Some(enable_stable_row_ids) = stable_row_ids_override
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids) .or(self.new_table_config.enable_stable_row_ids)
.or(has_blob_columns(&data_schema).then_some(true))
{ {
write_params.enable_stable_row_ids = enable_stable_row_ids; write_params.enable_stable_row_ids = enable_stable_row_ids;
} }
ensure_blob_storage_version(&data_schema, &mut write_params);
if matches!(&request.mode, CreateTableMode::Overwrite) { if matches!(&request.mode, CreateTableMode::Overwrite) {
write_params.mode = WriteMode::Overwrite; write_params.mode = WriteMode::Overwrite;
} }

View File

@@ -23,6 +23,7 @@ use lance_namespace_impls::ConnectBuilder;
use lance_table::io::commit::CommitHandler; use lance_table::io::commit::CommitHandler;
use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler; use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler;
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
use crate::connection::NamespaceClientPushdownOperation; use crate::connection::NamespaceClientPushdownOperation;
use crate::database::ReadConsistency; use crate::database::ReadConsistency;
use crate::database::listing::{ use crate::database::listing::{
@@ -257,12 +258,16 @@ impl LanceNamespaceDatabase {
params.enable_v2_manifest_paths = enable_v2_manifest_paths; params.enable_v2_manifest_paths = enable_v2_manifest_paths;
} }
if let Some(enable_stable_row_ids) = let data_schema = request.data.schema();
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids) if let Some(enable_stable_row_ids) = stable_row_ids_override
.or(self.new_table_config.enable_stable_row_ids)
.or(has_blob_columns(data_schema.as_ref()).then_some(true))
{ {
params.enable_stable_row_ids = enable_stable_row_ids; params.enable_stable_row_ids = enable_stable_row_ids;
} }
ensure_blob_storage_version(data_schema.as_ref(), params);
Ok(()) Ok(())
} }
} }

View File

@@ -163,6 +163,7 @@
//! ``` //! ```
pub mod arrow; pub mod arrow;
pub mod blob;
pub mod connection; pub mod connection;
pub mod data; pub mod data;
pub mod database; pub mod database;
@@ -188,6 +189,7 @@ use std::{fmt::Display, str::FromStr};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub use blob::blob;
pub use connection::{ConnectNamespaceBuilder, Connection}; pub use connection::{ConnectNamespaceBuilder, Connection};
pub use error::{Error, Result}; pub use error::{Error, Result};
use lance_index::vector::ApproxMode as LanceApproxMode; use lance_index::vector::ApproxMode as LanceApproxMode;

View File

@@ -1352,6 +1352,35 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
} }
} }
/// Deserialize an index's `created_at` field.
///
/// The server returns this as an RFC 3339 string (e.g. `"2026-06-18T21:37:36.637Z"`),
/// but older deployments sent a unix timestamp in milliseconds. Accept both so the
/// client works against any server version.
///
/// `null` (and, together with `#[serde(default)]`, a missing key) yields `None`.
///
/// # Errors
///
/// Returns a deserialization error when the value is a string that is not valid
/// RFC 3339, or an integer that does not map to a representable timestamp.
/// Neither case is silently turned into `None`.
fn deserialize_created_at<'de, D>(
    deserializer: D,
) -> std::result::Result<Option<DateTime<Utc>>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    use serde::de::Error as _;

    // Untagged: serde tries each variant in order, so a JSON string matches
    // `Rfc3339` and a JSON integer matches `Millis`.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum CreatedAt {
        Rfc3339(String),
        Millis(i64),
    }

    match Option::<CreatedAt>::deserialize(deserializer)? {
        None => Ok(None),
        Some(CreatedAt::Rfc3339(s)) => DateTime::parse_from_rfc3339(&s)
            .map(|dt| Some(dt.with_timezone(&Utc)))
            .map_err(D::Error::custom),
        // `from_timestamp_millis` returns `None` for out-of-range input; report
        // that as an error so bad data is rejected like a malformed string is.
        Some(CreatedAt::Millis(ms)) => DateTime::from_timestamp_millis(ms)
            .map(Some)
            .ok_or_else(|| {
                D::Error::custom(format!(
                    "created_at is not a valid unix timestamp in milliseconds: {ms}"
                ))
            }),
    }
}
impl<S: HttpSend + 'static> RemoteTable<S> { impl<S: HttpSend + 'static> RemoteTable<S> {
/// Parse the response from `/index/list/` into `IndexConfig` entries. /// Parse the response from `/index/list/` into `IndexConfig` entries.
/// ///
@@ -1380,7 +1409,7 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
// Used as the sentinel to decide whether to skip the stats call. // Used as the sentinel to decide whether to skip the stats call.
index_type: Option<IndexType>, index_type: Option<IndexType>,
index_uuid: Option<String>, index_uuid: Option<String>,
#[serde(default, with = "chrono::serde::ts_milliseconds_option")] #[serde(default, deserialize_with = "deserialize_created_at")]
created_at: Option<DateTime<Utc>>, created_at: Option<DateTime<Utc>>,
num_indexed_rows: Option<u64>, num_indexed_rows: Option<u64>,
num_unindexed_rows: Option<u64>, num_unindexed_rows: Option<u64>,
@@ -4678,7 +4707,7 @@ mod tests {
"num_segments": 2, "num_segments": 2,
"index_version": 1, "index_version": 1,
"index_details": "{\"num_partitions\":16}", "index_details": "{\"num_partitions\":16}",
"created_at": 1700000000000i64, "created_at": "2026-06-18T21:37:36.637Z",
"type_url": "type.googleapis.com/lance.index.vector.IvfPq", "type_url": "type.googleapis.com/lance.index.vector.IvfPq",
}, },
{ {
@@ -4728,7 +4757,10 @@ mod tests {
vec_idx.type_url, vec_idx.type_url,
Some("type.googleapis.com/lance.index.vector.IvfPq".to_string()) Some("type.googleapis.com/lance.index.vector.IvfPq".to_string())
); );
assert!(vec_idx.created_at.is_some()); assert_eq!(
vec_idx.created_at,
Some("2026-06-18T21:37:36.637Z".parse::<DateTime<Utc>>().unwrap())
);
let text_idx = &indices[1]; let text_idx = &indices[1];
assert_eq!(text_idx.name, "text_idx"); assert_eq!(text_idx.name, "text_idx");
@@ -4749,6 +4781,36 @@ mod tests {
assert_eq!(text_idx.created_at, None); assert_eq!(text_idx.created_at, None);
} }
#[test]
fn test_deserialize_created_at() {
    #[derive(Deserialize)]
    struct Wrapper {
        #[serde(default, deserialize_with = "deserialize_created_at")]
        created_at: Option<DateTime<Utc>>,
    }

    let parse = |json: &str| serde_json::from_str::<Wrapper>(json);

    // Current server format: RFC 3339 string.
    let expected_rfc3339 = "2026-06-18T21:37:36.637Z"
        .parse::<DateTime<Utc>>()
        .unwrap();
    let from_string = parse(r#"{"created_at": "2026-06-18T21:37:36.637Z"}"#).unwrap();
    assert_eq!(from_string.created_at, Some(expected_rfc3339));

    // Legacy server format: unix milliseconds.
    let from_millis = parse(r#"{"created_at": 1700000000000}"#).unwrap();
    assert_eq!(
        from_millis.created_at,
        DateTime::from_timestamp_millis(1700000000000)
    );

    // Explicit null and an absent key both deserialize to None.
    let from_null = parse(r#"{"created_at": null}"#).unwrap();
    assert_eq!(from_null.created_at, None);
    let from_missing = parse(r#"{}"#).unwrap();
    assert_eq!(from_missing.created_at, None);

    // A string that is not a date must fail instead of collapsing to None.
    assert!(parse(r#"{"created_at": "not-a-date"}"#).is_err());
}
#[tokio::test] #[tokio::test]
async fn test_list_versions() { async fn test_list_versions() {
let table = Table::new_with_handler("my_table", |request| { let table = Table::new_with_handler("my_table", |request| {

View File

@@ -26,6 +26,9 @@ pub enum AddDataMode {
#[default] #[default]
Append, Append,
/// The existing table will be overwritten with the new data /// The existing table will be overwritten with the new data
///
/// On overwrite, raw binary is not coerced into a blob struct. The input
/// must declare blob v2 for the column to stay a blob column.
Overwrite, Overwrite,
} }

View File

@@ -3,6 +3,7 @@
//! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers. //! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
mod blob_coerce;
pub mod cast; pub mod cast;
pub mod insert; pub mod insert;
pub mod reject_nan; pub mod reject_nan;

View File

@@ -0,0 +1,495 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Coerces write-path input into blob v2 struct columns.
//!
//! [`super::cast::cast_to_table_schema`] calls [`coerce_blob_expr`].
use std::sync::Arc;
use arrow_schema::{DataType, Field, FieldRef};
use datafusion::functions::core::{get_field, named_struct};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_physical_expr::ScalarFunctionExpr;
use datafusion_physical_expr::expressions::{CastExpr, Literal};
use datafusion_physical_plan::PhysicalExpr;
use crate::error::{Error, Result};
/// Builds the projection expression that turns `input_expr` into the blob
/// struct declared by `table_field`, composed from `named_struct`,
/// `get_field` and `cast`.
///
/// Accepted input types:
/// - `Binary`, `LargeBinary`, `BinaryView`: the value is cast into the declared
///   `data` child; every other declared child becomes a typed null.
/// - `Struct` containing a `data` or `uri` child: each declared child that the
///   input also has is read with `get_field` (and cast when the types differ);
///   declared children missing from the input become typed nulls.
///
/// Returns the `named_struct` expression paired with `table_field`.
///
/// # Errors
///
/// Returns [`Error::InvalidInput`] when `table_field` is not a struct, when the
/// input type is not one of the above, when an input struct has neither a
/// `data` nor a `uri` child, or when a typed null cannot be built for a child.
pub(super) fn coerce_blob_expr(
    input_expr: Arc<dyn PhysicalExpr>,
    input_field: &Field,
    table_field: &FieldRef,
    config: &Arc<ConfigOptions>,
) -> Result<(Arc<dyn PhysicalExpr>, FieldRef)> {
    let column = table_field.name();

    let declared_fields = match table_field.data_type() {
        DataType::Struct(fields) => fields,
        other => {
            return Err(Error::InvalidInput {
                message: format!(
                    "blob v2 column '{}' must be a struct, table declares {}",
                    column, other
                ),
            });
        }
    };

    // `None` marks raw binary input; `Some` carries the input struct's children.
    let input_struct_children = match input_field.data_type() {
        DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
        DataType::Struct(children) => {
            let has_payload_child = children
                .iter()
                .any(|c| matches!(c.name().as_str(), "data" | "uri"));
            if !has_payload_child {
                return Err(Error::InvalidInput {
                    message: format!(
                        "blob struct input for column '{}' must contain a 'data' or 'uri' child",
                        column
                    ),
                });
            }
            Some(children)
        }
        other => {
            return Err(Error::InvalidInput {
                message: format!(
                    "cannot coerce column '{}' with type {} into a blob v2 struct. \
                     expected Binary, LargeBinary, BinaryView, or a Struct with a 'data' or 'uri' child",
                    column, other,
                ),
            });
        }
    };

    // `named_struct` takes alternating (name literal, value) arguments.
    let mut ns_args: Vec<Arc<dyn PhysicalExpr>> = Vec::with_capacity(2 * declared_fields.len());
    for declared in declared_fields.iter() {
        let declared_name = declared.name().as_str();
        let declared_type = declared.data_type();

        let value: Arc<dyn PhysicalExpr> = match input_struct_children {
            // Raw binary lands in `data`.
            None if declared_name == "data" => Arc::new(CastExpr::new(
                Arc::clone(&input_expr),
                declared_type.clone(),
                None,
            )),
            // Every other child of a raw-binary input is a typed null.
            None => typed_null(declared_type)?,
            Some(children) => {
                let matching = children.iter().find(|c| c.name() == declared_name);
                match matching {
                    None => typed_null(declared_type)?,
                    Some(child) => {
                        let field_expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
                            &format!("get_field({})", declared_name),
                            get_field(),
                            vec![
                                Arc::clone(&input_expr),
                                Arc::new(Literal::new(ScalarValue::from(declared_name))),
                            ],
                            Arc::clone(child),
                            Arc::clone(config),
                        ));
                        if child.data_type() == declared_type {
                            field_expr
                        } else {
                            Arc::new(CastExpr::new(field_expr, declared_type.clone(), None))
                        }
                    }
                }
            }
        };

        ns_args.push(Arc::new(Literal::new(ScalarValue::from(declared_name))));
        ns_args.push(value);
    }

    let struct_expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
        &format!("named_struct({})", column),
        named_struct(),
        ns_args,
        Arc::clone(table_field),
        Arc::clone(config),
    ));
    Ok((struct_expr, Arc::clone(table_field)))
}
fn typed_null(data_type: &DataType) -> Result<Arc<dyn PhysicalExpr>> {
let scalar = ScalarValue::try_from(data_type).map_err(|e| Error::InvalidInput {
message: format!("cannot build null literal for blob child type {data_type}: {e}"),
})?;
Ok(Arc::new(Literal::new(scalar)))
}
#[cfg(test)]
mod tests {
use super::super::cast::cast_to_table_schema;
use super::*;
use crate::blob::blob;
use arrow_array::{
Array, ArrayRef, BinaryArray, BinaryViewArray, Int32Array, Int64Array, LargeBinaryArray,
RecordBatch, StringArray, StructArray, UInt8Array, UInt64Array,
};
use arrow_schema::Schema;
use datafusion::prelude::SessionContext;
use datafusion_catalog::MemTable;
use datafusion_physical_plan::ExecutionPlan;
use futures::TryStreamExt;
use lance_arrow::FieldExt;
use std::collections::HashMap;
/// Nullable blob v2 field declaring four children (`data`, `uri`, `position`,
/// `size`), with the extension marker written by hand.
fn wide_blob_field(name: &str) -> Field {
    let children = vec![
        Field::new("data", DataType::LargeBinary, true),
        Field::new("uri", DataType::Utf8, true),
        Field::new("position", DataType::UInt64, true),
        Field::new("size", DataType::UInt64, true),
    ];
    let marker = HashMap::from([(
        "ARROW:extension:name".to_string(),
        "lance.blob.v2".to_string(),
    )]);
    Field::new(name, DataType::Struct(children.into()), true).with_metadata(marker)
}
/// Table schema used by most tests: an `id` column plus a nullable blob
/// column named `image`.
fn blob_table_schema() -> Schema {
    let id = Field::new("id", DataType::Int64, false);
    let image = blob("image", true);
    Schema::new(vec![id, image])
}
/// Builds an input batch with sequential `id` values (starting at 0) next to
/// the given `image` column.
fn batch_with_image(image_field: Field, image: ArrayRef) -> RecordBatch {
    let rows = image.len() as i64;
    let ids: ArrayRef = Arc::new(Int64Array::from_iter_values(0..rows));
    let schema = Schema::new(vec![Field::new("id", DataType::Int64, false), image_field]);
    RecordBatch::try_new(Arc::new(schema), vec![ids, image]).unwrap()
}
/// Borrows the `image` column of `batch` as a struct array; panics if the
/// column is missing or is not a struct.
fn image_struct(batch: &RecordBatch) -> &StructArray {
    let column = batch.column_by_name("image").unwrap();
    column.as_any().downcast_ref::<StructArray>().unwrap()
}
/// Registers `batch` as an in-memory table and returns the physical plan that
/// scans it.
async fn plan_from_batch(batch: RecordBatch) -> Arc<dyn ExecutionPlan> {
    let mem_table = MemTable::try_new(batch.schema(), vec![vec![batch]]).unwrap();
    let ctx = SessionContext::new();
    ctx.register_table("t", Arc::new(mem_table)).unwrap();
    let frame = ctx.table("t").await.unwrap();
    frame.create_physical_plan().await.unwrap()
}
/// Runs `batch` through `cast_to_table_schema` and returns the executed
/// result concatenated into a single batch.
async fn coerce(batch: RecordBatch, table_schema: &Schema) -> RecordBatch {
    let source = plan_from_batch(batch).await;
    let projected = cast_to_table_schema(source, table_schema).unwrap();
    let ctx = SessionContext::new();
    let batches: Vec<RecordBatch> = projected
        .execute(0, ctx.task_ctx())
        .unwrap()
        .try_collect()
        .await
        .unwrap();
    arrow_select::concat::concat_batches(&projected.schema(), &batches).unwrap()
}
/// Returns the error `cast_to_table_schema` produces for `batch`; panics if
/// planning unexpectedly succeeds.
async fn coerce_err(batch: RecordBatch, table_schema: &Schema) -> Error {
    let source = plan_from_batch(batch).await;
    let outcome = cast_to_table_schema(source, table_schema);
    outcome.unwrap_err()
}
#[tokio::test]
async fn large_binary_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::LargeBinary, true),
Arc::new(LargeBinaryArray::from_iter_values([b"hello".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
let image_field = coerced.schema().field_with_name("image").unwrap().clone();
assert!(image_field.is_blob_v2());
assert!(matches!(image_field.data_type(), DataType::Struct(_)));
let data = image_struct(&coerced).column_by_name("data").unwrap();
let data: &LargeBinaryArray = data.as_any().downcast_ref().unwrap();
assert_eq!(data.value(0), b"hello");
}
#[tokio::test]
async fn binary_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::Binary, true),
Arc::new(BinaryArray::from_iter_values([b"hi".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
assert!(
coerced
.schema()
.field_with_name("image")
.unwrap()
.is_blob_v2()
);
}
#[tokio::test]
async fn binary_view_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::BinaryView, true),
Arc::new(BinaryViewArray::from_iter_values([b"view".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
let data = image_struct(&coerced).column_by_name("data").unwrap();
let data: &LargeBinaryArray = data.as_any().downcast_ref().unwrap();
assert_eq!(data.value(0), b"view");
}
// Null entries in the raw binary input stay null in the `data` child.
#[tokio::test]
async fn binary_nulls_stay_null_after_coercion() {
    let payloads = BinaryArray::from_iter(vec![Some(b"present".as_slice()), None]);
    let input = batch_with_image(
        Field::new("image", DataType::Binary, true),
        Arc::new(payloads),
    );
    let coerced = coerce(input, &blob_table_schema()).await;

    let data = image_struct(&coerced).column_by_name("data").unwrap();
    assert!(!data.is_null(0));
    assert!(data.is_null(1));
}
// Raw binary written into a table whose blob column declares four children:
// the output keeps all four, `data` carries the input (including its null),
// and the children the input cannot supply (`position`, `size`) are null.
#[tokio::test]
async fn binary_coerces_into_four_child_blob_layout() {
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
// Two rows: one payload and one null.
let batch = batch_with_image(
Field::new("image", DataType::LargeBinary, true),
Arc::new(LargeBinaryArray::from_iter(vec![
Some(b"alpha".as_slice()),
None,
])),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
assert_eq!(
image.num_columns(),
4,
"coerced struct keeps the declared layout"
);
assert!(image.column_by_name("position").unwrap().is_null(0));
assert!(image.column_by_name("size").unwrap().is_null(0));
assert!(!image.column_by_name("data").unwrap().is_null(0));
assert!(image.column_by_name("data").unwrap().is_null(1));
}
// A struct input with the blob's child layout but no field metadata comes out
// with the blob v2 marker on the output field.
#[tokio::test]
async fn prebuilt_struct_gains_blob_field_metadata() {
// Reuse the child fields of a declared blob column for the input struct.
let DataType::Struct(children) = blob("image", true).data_type().clone() else {
unreachable!("blob field is a struct")
};
let prebuilt = StructArray::new(
children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"prebuilt".as_slice()])),
Arc::new(StringArray::from(vec![None::<&str>])),
],
None,
);
// The input field is built from the data type alone, so it carries no metadata.
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &blob_table_schema()).await;
assert!(
coerced
.schema()
.field_with_name("image")
.unwrap()
.is_blob_v2()
);
}
// A two-child struct input written into a table declaring the four-child
// layout: the output has four children and the two the input lacks are null.
#[tokio::test]
async fn prebuilt_narrow_struct_widens_to_declared_layout() {
let DataType::Struct(narrow_children) = blob("image", true).data_type().clone() else {
unreachable!("blob field is a struct")
};
let prebuilt = StructArray::new(
narrow_children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"prebuilt".as_slice()])),
Arc::new(StringArray::from(vec![None::<&str>])),
],
None,
);
// The table side declares the wider layout.
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
assert_eq!(image.num_columns(), 4);
assert!(image.column_by_name("position").unwrap().is_null(0));
assert!(image.column_by_name("size").unwrap().is_null(0));
}
// A four-child struct input with null `data` and populated `uri`, `position`
// and `size` passes through coercion with those three values intact and
// `data` still null.
#[tokio::test]
async fn external_reference_struct_preserves_uri_position_and_size() {
let prebuilt = StructArray::new(
vec![
Field::new("data", DataType::LargeBinary, true),
Field::new("uri", DataType::Utf8, true),
Field::new("position", DataType::UInt64, true),
Field::new("size", DataType::UInt64, true),
]
.into(),
vec![
Arc::new(LargeBinaryArray::from(vec![None::<&[u8]>])) as ArrayRef,
Arc::new(StringArray::from(vec![Some("s3://bucket/blob.bin")])) as ArrayRef,
Arc::new(UInt64Array::from(vec![Some(7)])) as ArrayRef,
Arc::new(UInt64Array::from(vec![Some(6)])) as ArrayRef,
],
None,
);
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
// The input field has the same child types as the table but no blob metadata.
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
let uri: &StringArray = image
.column_by_name("uri")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(uri.value(0), "s3://bucket/blob.bin");
let position: &UInt64Array = image
.column_by_name("position")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(position.value(0), 7);
let size: &UInt64Array = image
.column_by_name("size")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(size.value(0), 6);
assert!(image.column_by_name("data").unwrap().is_null(0));
}
// A struct input with only `kind`, `position` and `size` children (no `data`
// or `uri`) is rejected, and the error names both the missing children and
// the column.
#[tokio::test]
async fn descriptor_struct_without_value_child_is_rejected() {
let descriptor = StructArray::new(
vec![
Field::new("kind", DataType::UInt8, false),
Field::new("position", DataType::UInt64, false),
Field::new("size", DataType::UInt64, false),
]
.into(),
vec![
Arc::new(UInt8Array::from(vec![0])),
Arc::new(UInt64Array::from(vec![0])),
Arc::new(UInt64Array::from(vec![0])),
],
None,
);
let batch = batch_with_image(
Field::new("image", descriptor.data_type().clone(), true),
Arc::new(descriptor),
);
// Planning itself fails, so no execution is needed.
let err = coerce_err(batch, &blob_table_schema()).await;
assert!(err.to_string().contains("'data' or 'uri'"));
assert!(err.to_string().contains("image"));
}
#[tokio::test]
async fn unsupported_input_type_is_rejected_with_column_name() {
let batch = batch_with_image(
Field::new("image", DataType::Utf8, true),
Arc::new(StringArray::from(vec!["not bytes"])),
);
let err = coerce_err(batch, &blob_table_schema()).await;
assert!(matches!(err, Error::InvalidInput { .. }), "got {err:?}");
assert!(err.to_string().contains("image"));
}
#[tokio::test]
async fn blob_metadata_survives_cast_of_sibling_column() {
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("image", DataType::LargeBinary, true),
])),
vec![
Arc::new(Int32Array::from(vec![1])),
Arc::new(LargeBinaryArray::from_iter_values([b"x".as_slice()])),
],
)
.unwrap();
let coerced = coerce(batch, &blob_table_schema()).await;
let image_field = coerced.schema().field_with_name("image").unwrap().clone();
assert!(
image_field.is_blob_v2(),
"expected blob marker on image field, got {:?}",
image_field.metadata()
);
assert_eq!(
coerced.schema().field_with_name("id").unwrap().data_type(),
&DataType::Int64
);
}
// When the input field already equals the declared blob field,
// `cast_to_table_schema` must hand back the very same plan node instead of
// wrapping it in a projection.
#[tokio::test]
async fn exact_blob_input_passes_through_unchanged() {
    let DataType::Struct(children) = blob("image", true).data_type().clone() else {
        unreachable!("blob field is a struct")
    };
    let image = StructArray::new(
        children,
        vec![
            Arc::new(LargeBinaryArray::from_iter_values([b"exact".as_slice()])),
            Arc::new(StringArray::from(vec![None::<&str>])),
        ],
        None,
    );
    // The input field is the declared blob field itself, metadata included.
    let batch = batch_with_image(blob("image", true), Arc::new(image));
    let table_schema = blob_table_schema();
    let input = plan_from_batch(batch).await;
    // Keep our own handle and compare with `Arc::ptr_eq`, which looks only at
    // the allocation address. Comparing `Arc::as_ptr` values of a trait object
    // also compares vtable pointers, which are not guaranteed to be unique per
    // type and can make an identity check fail spuriously.
    let plan = cast_to_table_schema(Arc::clone(&input), &table_schema).unwrap();
    assert!(Arc::ptr_eq(&plan, &input), "no projection inserted");
}
}

View File

@@ -13,8 +13,10 @@ use datafusion_physical_expr::expressions::{CastExpr, Literal};
use datafusion_physical_plan::expressions::Column; use datafusion_physical_plan::expressions::Column;
use datafusion_physical_plan::projection::ProjectionExec; use datafusion_physical_plan::projection::ProjectionExec;
use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr}; use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
use lance_arrow::FieldExt;
use lance_arrow::json::{is_arrow_json_field, is_json_field}; use lance_arrow::json::{is_arrow_json_field, is_json_field};
use super::blob_coerce::coerce_blob_expr;
use crate::{Error, Result}; use crate::{Error, Result};
pub fn cast_to_table_schema( pub fn cast_to_table_schema(
@@ -77,6 +79,17 @@ fn build_field_exprs(
continue; continue;
} }
// Blob columns accept raw binary on write; exact matches pass through below.
if table_field.is_blob_v2() && input_field.as_ref() != table_field.as_ref() {
result.push(coerce_blob_expr(
input_expr,
input_field,
table_field,
&config,
)?);
continue;
}
let expr = match (input_field.data_type(), table_field.data_type()) { let expr = match (input_field.data_type(), table_field.data_type()) {
// Both are structs: recurse into sub-fields to handle subschemas and casts. // Both are structs: recurse into sub-fields to handle subschemas and casts.
(DataType::Struct(in_children), DataType::Struct(tbl_children)) (DataType::Struct(in_children), DataType::Struct(tbl_children))

View File

@@ -0,0 +1,380 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Integration tests for blob v2 columns.
use std::sync::Arc;
use arrow_array::{Array, BinaryArray, Int64Array, LargeBinaryArray, RecordBatch, StructArray};
use arrow_schema::{DataType, Field, Schema};
use futures::TryStreamExt;
use lance_encoding::version::LanceFileVersion;
use lancedb::{
Connection, Result, Table, blob::blob, connect,
database::listing::OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, query::ExecutableQuery,
};
use tempfile::tempdir;
/// Shared table schema: an `id` column plus a nullable blob column `image`.
fn blob_table_schema() -> Arc<Schema> {
    let id = Field::new("id", DataType::Int64, false);
    let image = blob("image", true);
    Arc::new(Schema::new(vec![id, image]))
}
fn binary_input_batch(ids: &[i64], payloads: &[Option<&[u8]>]) -> RecordBatch {
RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("image", DataType::LargeBinary, true),
])),
vec![
Arc::new(Int64Array::from(ids.to_vec())),
Arc::new(LargeBinaryArray::from_iter(payloads.iter().copied())),
],
)
.unwrap()
}
/// Creates an empty table named `name` with the shared blob schema, then appends
/// a single raw `LargeBinary` batch built from `ids` and `payloads`.
///
/// Returns the table handle; any error from table creation or the append is propagated.
async fn create_inline_blob_table(
    db: &Connection,
    name: &str,
    ids: &[i64],
    payloads: &[Option<&[u8]>],
) -> Result<Table> {
    let schema = blob_table_schema();
    let table = db.create_empty_table(name, schema).execute().await?;

    let batch = binary_input_batch(ids, payloads);
    table.add(batch).execute().await?;

    Ok(table)
}
/// Reads the table's manifest and returns the resolved Lance file version of
/// its data storage format.
///
/// # Panics
///
/// Panics if the table is not a native table, the manifest cannot be loaded,
/// or the stored file version cannot be parsed.
async fn storage_format_version(table: &Table) -> LanceFileVersion {
    let native = table.as_native().unwrap();
    let manifest = native.manifest().await.unwrap();
    let declared = manifest
        .data_storage_format
        .lance_file_version()
        .unwrap();
    declared.resolve()
}
/// Reports whether the table's manifest says stable row ids are in use.
///
/// # Panics
///
/// Panics if the table is not a native table or the manifest cannot be loaded.
async fn uses_stable_row_ids(table: &Table) -> bool {
    let native = table.as_native().unwrap();
    let manifest = native.manifest().await.unwrap();
    manifest.uses_stable_row_ids()
}
/// Runs an unfiltered query against `table` and returns its `image` column as
/// a single [`StructArray`].
///
/// All result batches are concatenated before the column is extracted, so the
/// returned array covers every row the query produced.
///
/// # Panics
///
/// Panics if the query fails, yields no batches, has no `image` column, or if
/// that column is not a struct array.
async fn query_image_struct(table: &Table) -> StructArray {
    let batches = table
        .query()
        .execute()
        .await
        .unwrap()
        .try_collect::<Vec<_>>()
        .await
        .unwrap();
    // `concat_batches` needs a schema. Take it from the first batch, but fail
    // with a descriptive message rather than an opaque index-out-of-bounds
    // panic when the query produced nothing.
    let schema = batches
        .first()
        .expect("query returned at least one batch")
        .schema();
    let batch = arrow_select::concat::concat_batches(&schema, &batches).unwrap();
    batch
        .column_by_name("image")
        .expect("image column present")
        .as_any()
        .downcast_ref::<StructArray>()
        .expect("blob column reads back as a descriptor struct")
        .clone()
}
// Creating an empty table whose schema declares a blob column must yield a
// storage format of at least 2.2 and have stable row ids turned on.
#[tokio::test]
async fn declaring_blob_column_bumps_format_and_enables_stable_row_ids() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    let schema = blob_table_schema();
    let table = db.create_empty_table("t", schema).execute().await?;

    let version = storage_format_version(&table).await;
    assert!(version >= LanceFileVersion::V2_2);
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(stable_row_ids);
    Ok(())
}
// A table-level `OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS = "false"` must override
// the blob default for stable row ids, while the format bump still happens.
#[tokio::test]
async fn explicit_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    let table = db
        .create_empty_table("t", blob_table_schema())
        .storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
        .execute()
        .await?;

    let version = storage_format_version(&table).await;
    assert!(
        version >= LanceFileVersion::V2_2,
        "format bump still applies; the schema cannot be written below 2.2"
    );
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(!stable_row_ids);
    Ok(())
}
// Control case: a table without any blob column stays below format 2.2 and
// does not use stable row ids.
#[tokio::test]
async fn non_blob_table_keeps_default_format_and_row_id_setting() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    let id_only = vec![Field::new("id", DataType::Int64, false)];
    let schema = Arc::new(Schema::new(id_only));
    let table = db.create_empty_table("t", schema).execute().await?;

    let version = storage_format_version(&table).await;
    assert!(version < LanceFileVersion::V2_2);
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(!stable_row_ids);
    Ok(())
}
// Creating a table directly from data whose schema already declares the blob
// field (a pre-built struct column) must apply the same format bump and
// stable-row-id default as declaring the column on an empty table.
#[tokio::test]
async fn creating_with_blob_data_bumps_format() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    // The batch carries the declared blob field, so its column is a struct
    // assembled from the field's own child fields.
    let blob_field = blob("image", true);
    let child_fields = match blob_field.data_type() {
        DataType::Struct(fields) => fields.clone(),
        _ => unreachable!("blob field is a struct"),
    };
    let payload_child = LargeBinaryArray::from_iter_values([b"payload".as_slice()]);
    let second_child = arrow_array::StringArray::from(vec![None::<&str>]);
    let image = StructArray::new(
        child_fields,
        vec![Arc::new(payload_child), Arc::new(second_child)],
        None,
    );

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        blob_field,
    ]));
    let ids = Int64Array::from(vec![1]);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(ids), Arc::new(image)]).unwrap();

    let table = db.create_table("t", batch).execute().await?;

    let version = storage_format_version(&table).await;
    assert!(version >= LanceFileVersion::V2_2);
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(stable_row_ids);
    let rows = table.count_rows(None).await?;
    assert_eq!(rows, 1);
    Ok(())
}
// Appending a plain `LargeBinary` column to a blob-declared table succeeds,
// the rows read back as a struct column, and the table schema keeps the
// `lance.blob.v2` extension marker.
#[tokio::test]
async fn add_coerces_large_binary_into_blob_column() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    let table =
        create_inline_blob_table(&db, "t", &[1, 2], &[Some(b"cat".as_slice()), Some(b"dog")])
            .await?;

    let rows = table.count_rows(None).await?;
    assert_eq!(rows, 2);
    let image = query_image_struct(&table).await;
    assert_eq!(image.len(), 2);

    // The append must not have stripped the blob marker from the table schema.
    let schema = table.schema().await?;
    let image_field = schema.field_with_name("image").unwrap();
    let extension_name = image_field
        .metadata()
        .get("ARROW:extension:name")
        .map(|name| name.as_str());
    assert_eq!(extension_name, Some("lance.blob.v2"));
    Ok(())
}
// Appending a plain `Binary` (not `LargeBinary`) column to a blob-declared
// table is accepted as well.
#[tokio::test]
async fn add_coerces_binary_into_blob_column() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;
    let table = db
        .create_empty_table("t", blob_table_schema())
        .execute()
        .await?;

    let input_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("image", DataType::Binary, true),
    ]));
    let ids = Int64Array::from(vec![1]);
    let images = BinaryArray::from_iter_values([b"small".as_slice()]);
    let batch =
        RecordBatch::try_new(input_schema, vec![Arc::new(ids), Arc::new(images)]).unwrap();

    table.add(batch).execute().await?;

    let rows = table.count_rows(None).await?;
    assert_eq!(rows, 1);
    Ok(())
}
// A null payload in the middle of the input batch is accepted; all three rows
// are stored and read back.
#[tokio::test]
async fn add_accepts_null_blob_rows() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;

    let table = create_inline_blob_table(
        &db,
        "t",
        &[1, 2, 3],
        &[Some(b"first".as_slice()), None, Some(b"third")],
    )
    .await?;

    let rows = table.count_rows(None).await?;
    assert_eq!(rows, 3);
    let image = query_image_struct(&table).await;
    assert_eq!(image.len(), 3);
    Ok(())
}
// A `Utf8` column cannot be written into a blob column: the append must fail
// and the error text must mention the offending column name.
#[tokio::test]
async fn add_rejects_uncoercible_blob_input() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;
    let table = db
        .create_empty_table("t", blob_table_schema())
        .execute()
        .await?;

    let input_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("image", DataType::Utf8, true),
    ]));
    let ids = Int64Array::from(vec![1]);
    let images = arrow_array::StringArray::from(vec!["not bytes"]);
    let batch =
        RecordBatch::try_new(input_schema, vec![Arc::new(ids), Arc::new(images)]).unwrap();

    let err = table.add(batch).execute().await.unwrap_err();
    let message = err.to_string();
    assert!(message.contains("image"), "got: {err}");
    Ok(())
}
// Same override as the table-level case, but with the option set on the
// connection: stable row ids stay off while the format bump still applies.
#[tokio::test]
async fn connection_level_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri)
        .storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
        .execute()
        .await?;

    let schema = blob_table_schema();
    let table = db.create_empty_table("t", schema).execute().await?;

    let version = storage_format_version(&table).await;
    assert!(version >= LanceFileVersion::V2_2);
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(!stable_row_ids);
    Ok(())
}
// Tables created through a `dir` namespace connection get the same blob
// defaults (format >= 2.2, stable row ids on) as a plain connection.
#[tokio::test]
async fn namespace_create_applies_blob_defaults() -> Result<()> {
    let dir = tempdir().unwrap();
    let root = dir.path().to_str().unwrap().to_string();
    let properties = std::collections::HashMap::from([("root".to_string(), root)]);
    let db = lancedb::connect_namespace("dir", properties)
        .execute()
        .await?;

    let schema = blob_table_schema();
    let table = db.create_empty_table("t", schema).execute().await?;

    let version = storage_format_version(&table).await;
    assert!(version >= LanceFileVersion::V2_2);
    let stable_row_ids = uses_stable_row_ids(&table).await;
    assert!(stable_row_ids);
    Ok(())
}
// Overwrite adopts the input schema as-is (the same way the cast step is
// skipped). Overwriting with raw binary therefore drops the blob marker,
// unless the input itself declares blob v2.
#[tokio::test]
async fn overwrite_replaces_blob_schema_with_input_schema() -> Result<()> {
    use lancedb::table::AddDataMode;

    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let db = connect(uri).execute().await?;
    let table = create_inline_blob_table(&db, "t", &[1], &[Some(b"blob".as_slice())]).await?;

    // Step 1: overwrite with raw binary. The plain LargeBinary column
    // replaces the blob declaration.
    let raw_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("image", DataType::LargeBinary, true),
    ]));
    let raw_ids = Int64Array::from(vec![2]);
    let raw_images = LargeBinaryArray::from_iter_values([b"plain".as_slice()]);
    let raw_batch = RecordBatch::try_new(
        raw_schema.clone(),
        vec![Arc::new(raw_ids), Arc::new(raw_images)],
    )
    .unwrap();
    table
        .add(raw_batch)
        .mode(AddDataMode::Overwrite)
        .execute()
        .await?;

    let schema_after_raw = table.schema().await?;
    assert_eq!(schema_after_raw, raw_schema);
    let raw_image_field = schema_after_raw.field_with_name("image").unwrap();
    assert!(
        !raw_image_field
            .metadata()
            .contains_key("ARROW:extension:name"),
        "raw binary overwrite leaves a plain binary column"
    );

    // Step 2: overwrite with a declared blob struct. The blob column is kept.
    let blob_field = blob("image", true);
    let child_fields = match blob_field.data_type() {
        DataType::Struct(fields) => fields.clone(),
        _ => unreachable!("blob field is a struct"),
    };
    let payload_child = LargeBinaryArray::from_iter_values([b"declared".as_slice()]);
    let second_child = arrow_array::StringArray::from(vec![None::<&str>]);
    let image = StructArray::new(
        child_fields,
        vec![Arc::new(payload_child), Arc::new(second_child)],
        None,
    );
    let declared_schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        blob_field,
    ]));
    let declared_ids = Int64Array::from(vec![3]);
    let declared_batch = RecordBatch::try_new(
        declared_schema,
        vec![Arc::new(declared_ids), Arc::new(image)],
    )
    .unwrap();
    table
        .add(declared_batch)
        .mode(AddDataMode::Overwrite)
        .execute()
        .await?;

    let schema_after_blob = table.schema().await?;
    let extension_name = schema_after_blob
        .field_with_name("image")
        .unwrap()
        .metadata()
        .get("ARROW:extension:name")
        .map(|name| name.as_str());
    assert_eq!(extension_name, Some("lance.blob.v2"));
    Ok(())
}