mirror of
https://github.com/lancedb/lancedb.git
synced 2026-06-19 20:20:46 +00:00
Compare commits
4 Commits
v0.31.0-be
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ba1ef34481 | ||
|
|
85d870b397 | ||
|
|
c46d59d2ee | ||
|
|
113f187c2d |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.31.0-beta.0"
|
||||
current_version = "0.31.0-beta.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
86
Cargo.lock
generated
86
Cargo.lock
generated
@@ -3432,8 +3432,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsst"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"rand 0.9.4",
|
||||
@@ -4735,8 +4735,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
|
||||
|
||||
[[package]]
|
||||
name = "lance"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"arrow",
|
||||
@@ -4810,8 +4810,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-arrow"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4832,7 +4832,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-arrow-scalar"
|
||||
version = "58.0.0"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4846,7 +4846,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "lance-arrow-stats"
|
||||
version = "58.0.0"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
@@ -4855,8 +4855,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-bitpacking"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrayref",
|
||||
"paste",
|
||||
@@ -4865,8 +4865,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-core"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -4904,8 +4904,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datafusion"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4935,8 +4935,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-datagen"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -4953,8 +4953,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-derive"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -4963,8 +4963,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-encoding"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -4999,8 +4999,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-file"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-arith",
|
||||
"arrow-array",
|
||||
@@ -5030,8 +5030,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-index"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arc-swap",
|
||||
"arrow",
|
||||
@@ -5096,8 +5096,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-io"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-arith",
|
||||
@@ -5138,8 +5138,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-linalg"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -5154,8 +5154,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -5167,8 +5167,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-namespace-impls"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-ipc",
|
||||
@@ -5222,8 +5222,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-select"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -5238,8 +5238,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
@@ -5278,8 +5278,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-testing"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-schema",
|
||||
@@ -5292,8 +5292,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lance-tokenizer"
|
||||
version = "8.0.0-beta.19"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.19#5fd0659a49e4d080290438da7acd80f4e6b0f1d8"
|
||||
version = "8.0.0-rc.1"
|
||||
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
|
||||
dependencies = [
|
||||
"icu_segmenter",
|
||||
"jieba-rs",
|
||||
@@ -5306,7 +5306,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb"
|
||||
version = "0.31.0-beta.0"
|
||||
version = "0.31.0-beta.1"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
@@ -5389,7 +5389,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-nodejs"
|
||||
version = "0.31.0-beta.0"
|
||||
version = "0.31.0-beta.1"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
@@ -5414,7 +5414,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "lancedb-python"
|
||||
version = "0.34.0-beta.0"
|
||||
version = "0.34.0-beta.1"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
|
||||
28
Cargo.toml
28
Cargo.toml
@@ -13,20 +13,20 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.91.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=8.0.0-beta.19", default-features = false, "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=8.0.0-beta.19", default-features = false, "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=8.0.0-beta.19", default-features = false, "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=8.0.0-beta.19", "tag" = "v8.0.0-beta.19", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-core = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datagen = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-file = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-io = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-index = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-linalg = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-namespace-impls = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-table = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-testing = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-datafusion = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-encoding = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
lance-arrow = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
|
||||
ahash = "0.8"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "58.0.0", optional = false }
|
||||
|
||||
@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
|
||||
<dependency>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-core</artifactId>
|
||||
<version>0.31.0-beta.0</version>
|
||||
<version>0.31.0-beta.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.31.0-beta.0</version>
|
||||
<version>0.31.0-beta.1</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.31.0-beta.0</version>
|
||||
<version>0.31.0-beta.1</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
@@ -28,7 +28,7 @@
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<arrow.version>15.0.0</arrow.version>
|
||||
<lance-core.version>8.0.0-beta.19</lance-core.version>
|
||||
<lance-core.version>8.0.0-rc.1</lance-core.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<spotless.version>2.30.0</spotless.version>
|
||||
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.31.0-beta.0"
|
||||
version = "0.31.0-beta.1"
|
||||
publish = false
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.31.0-beta.0",
|
||||
"version": "0.31.0-beta.1",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -91,7 +91,9 @@ async def test_create_scalar_index(some_table: AsyncTable):
|
||||
# Can recreate if replace=True
|
||||
await some_table.create_index("id", replace=True)
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(BTree, columns=["id"], name="id_idx")]'
|
||||
assert str(indices).startswith(
|
||||
'[IndexConfig(name="id_idx", index_type="BTree", columns=["id"]'
|
||||
)
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "BTree"
|
||||
assert indices[0].columns == ["id"]
|
||||
@@ -106,6 +108,27 @@ async def test_create_scalar_index(some_table: AsyncTable):
|
||||
assert len(indices) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_index_config_repr(db_async):
    """Check the textual repr of an index config returned by ``list_indices``."""
    # 1500 rows (>= 1000) so the thousands separator shows up in the repr.
    row_count = 1500
    ids = pa.Table.from_pydict({"id": list(range(row_count))})
    tbl = await db_async.create_table("repr_table", ids)
    await tbl.create_index("id", config=BTree())

    listed = await tbl.list_indices()
    assert len(listed) == 1

    text = repr(listed[0])
    expected_prefix = 'IndexConfig(name="id_idx", index_type="BTree", columns=["id"]'
    assert text.startswith(expected_prefix)
    # Row counts are grouped with `_`, which is valid Python int-literal syntax.
    assert "num_indexed_rows=1_500" in text
    assert "num_unindexed_rows=0" in text
    # created_at is shown as a datetime repr so the value can round-trip.
    assert "created_at=datetime.datetime(" in text
    assert text.endswith(")")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
metadata_type = pa.struct(
|
||||
@@ -198,7 +221,9 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
await some_table.create_index("fsb", config=BTree())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(BTree, columns=["fsb"], name="fsb_idx")]'
|
||||
assert str(indices).startswith(
|
||||
'[IndexConfig(name="fsb_idx", index_type="BTree", columns=["fsb"]'
|
||||
)
|
||||
assert len(indices) == 1
|
||||
assert indices[0].index_type == "BTree"
|
||||
assert indices[0].columns == ["fsb"]
|
||||
@@ -247,7 +272,9 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
async def test_create_label_list_index(some_table: AsyncTable):
|
||||
await some_table.create_index("tags", config=LabelList())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
assert str(indices).startswith(
|
||||
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
|
||||
)
|
||||
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
|
||||
assert "ScalarIndexQuery" in plan
|
||||
|
||||
@@ -262,7 +289,9 @@ async def test_create_large_list_label_list_index(db_async):
|
||||
|
||||
await table.create_index("tags", config=LabelList())
|
||||
indices = await table.list_indices()
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
assert str(indices).startswith(
|
||||
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
|
||||
)
|
||||
plan = await table.query().where("array_has(tags, 'shared')").explain_plan()
|
||||
assert "ScalarIndexQuery" in plan
|
||||
|
||||
@@ -299,7 +328,9 @@ async def test_create_label_list_index_rejects_list_struct(db_async):
|
||||
async def test_full_text_search_index(some_table: AsyncTable):
|
||||
await some_table.create_index("tags", config=FTS(with_position=False))
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(FTS, columns=["tags"], name="tags_idx")]'
|
||||
assert str(indices).startswith(
|
||||
'[IndexConfig(name="tags_idx", index_type="FTS", columns=["tags"]'
|
||||
)
|
||||
|
||||
await some_table.prewarm_index("tags_idx")
|
||||
|
||||
|
||||
@@ -319,11 +319,53 @@ pub struct IndexConfig {
|
||||
|
||||
#[pymethods]
|
||||
impl IndexConfig {
|
||||
pub fn __repr__(&self) -> String {
|
||||
format!(
|
||||
"Index({}, columns={:?}, name=\"{}\")",
|
||||
self.index_type, self.columns, self.name
|
||||
)
|
||||
pub fn __repr__(&self, py: Python<'_>) -> String {
|
||||
let mut fields = vec![
|
||||
format!("name={:?}", self.name),
|
||||
format!("index_type={:?}", self.index_type),
|
||||
format!("columns={:?}", self.columns),
|
||||
];
|
||||
if let Some(v) = &self.index_uuid {
|
||||
fields.push(format!("index_uuid={:?}", v));
|
||||
}
|
||||
if let Some(v) = &self.type_url {
|
||||
fields.push(format!("type_url={:?}", v));
|
||||
}
|
||||
if let Some(v) = self.created_at {
|
||||
// Render the datetime's own Python repr so the value round-trips,
|
||||
// falling back to RFC 3339 if the conversion ever fails.
|
||||
let rendered = v
|
||||
.into_pyobject(py)
|
||||
.ok()
|
||||
.and_then(|obj| obj.into_any().repr().ok())
|
||||
.map(|r| r.to_string())
|
||||
.unwrap_or_else(|| v.to_rfc3339());
|
||||
fields.push(format!("created_at={}", rendered));
|
||||
}
|
||||
if let Some(v) = self.num_indexed_rows {
|
||||
fields.push(format!("num_indexed_rows={}", fmt_thousands(v)));
|
||||
}
|
||||
if let Some(v) = self.num_unindexed_rows {
|
||||
fields.push(format!("num_unindexed_rows={}", fmt_thousands(v)));
|
||||
}
|
||||
if let Some(v) = self.size_bytes {
|
||||
fields.push(format!("size_bytes={}", fmt_thousands(v)));
|
||||
}
|
||||
if let Some(v) = self.num_segments {
|
||||
fields.push(format!("num_segments={}", v));
|
||||
}
|
||||
if let Some(v) = self.index_version {
|
||||
fields.push(format!("index_version={}", v));
|
||||
}
|
||||
if let Some(v) = &self.index_details {
|
||||
let details = v
|
||||
.bind(py)
|
||||
.repr()
|
||||
.map(|r| r.to_string())
|
||||
.unwrap_or_else(|_| "<unavailable>".to_string());
|
||||
fields.push(format!("index_details={}", details));
|
||||
}
|
||||
format!("IndexConfig({})", fields.join(", "))
|
||||
}
|
||||
|
||||
// For backwards-compatibility with the old sync SDK, we also support getting
|
||||
@@ -352,6 +394,23 @@ impl IndexConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// Renders `n` in decimal with `_` between groups of three digits, counted
/// from the right, e.g. `24_500_213`.
///
/// `_` is accepted inside Python integer literals, so the output can be
/// pasted back into Python and parsed while staying easy to read.
fn fmt_thousands(n: u64) -> String {
    let plain = n.to_string();
    // `rchunks(3)` yields three-digit groups starting from the least
    // significant end; reversing restores left-to-right order, with any
    // shorter group ending up in front.
    let groups: Vec<&str> = plain
        .as_bytes()
        .rchunks(3)
        .rev()
        .map(|group| std::str::from_utf8(group).expect("decimal digits are ASCII"))
        .collect();
    groups.join("_")
}
|
||||
|
||||
fn parse_index_details(py: Python<'_>, s: String) -> Py<PyAny> {
|
||||
let json = py.import("json").expect("json module is always available");
|
||||
match json.call_method1("loads", (s.as_str(),)) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.31.0-beta.0"
|
||||
version = "0.31.0-beta.1"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
126
rust/lancedb/src/blob.rs
Normal file
126
rust/lancedb/src/blob.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Lance blob v2 columns store large binary payloads out of line.
|
||||
//!
|
||||
//! Declare a column with [`blob`]. On write, [`crate::table::Table::add`] coerces
|
||||
//! raw `Binary` / `LargeBinary` into the blob struct layout. Queries return
|
||||
//! small descriptors, not bytes.
|
||||
//!
|
||||
//! Blob tables require Lance file format >= 2.2 and stable row ids enabled when the table is created.
|
||||
|
||||
use arrow_schema::{Field, Schema};
|
||||
use lance::dataset::WriteParams;
|
||||
use lance_arrow::FieldExt;
|
||||
use lance_encoding::version::LanceFileVersion;
|
||||
|
||||
/// Creates an Arrow field for a Lance blob v2 column.
|
||||
///
|
||||
/// `Struct<data, uri>` with the `lance.blob.v2` marker. Same layout Lance
|
||||
/// expects on write.
|
||||
///
|
||||
/// ```
|
||||
/// use arrow_schema::{DataType, Field, Schema};
|
||||
///
|
||||
/// let schema = Schema::new(vec![
|
||||
/// Field::new("id", DataType::Int64, false),
|
||||
/// lancedb::blob("image", true),
|
||||
/// ]);
|
||||
/// ```
|
||||
///
|
||||
/// Blob tables use Lance file format >= 2.2 and stable row ids at create.
|
||||
pub fn blob(name: impl AsRef<str>, nullable: bool) -> Field {
|
||||
lance::blob::blob_field(name.as_ref(), nullable)
|
||||
}
|
||||
|
||||
/// Returns true if `schema` declares any blob v2 column.
|
||||
pub(crate) fn has_blob_columns(schema: &Schema) -> bool {
|
||||
schema.fields().iter().any(|field| field.is_blob_v2())
|
||||
}
|
||||
|
||||
/// Bumps storage format to at least [`LanceFileVersion::V2_2`] for blob schemas.
|
||||
pub(crate) fn ensure_blob_storage_version(schema: &Schema, params: &mut WriteParams) {
|
||||
if !has_blob_columns(schema) {
|
||||
return;
|
||||
}
|
||||
|
||||
let resolved = params
|
||||
.data_storage_version
|
||||
.unwrap_or(LanceFileVersion::Stable)
|
||||
.resolve();
|
||||
if resolved < LanceFileVersion::V2_2 {
|
||||
params.data_storage_version = Some(LanceFileVersion::V2_2);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::DataType;
    use lance_arrow::ARROW_EXT_NAME_KEY;

    /// Schema with only a plain `id` column (no blob columns).
    fn plain_schema() -> Schema {
        Schema::new(vec![Field::new("id", DataType::Int64, false)])
    }

    /// Schema with a plain `id` column plus a nullable `image` blob column.
    fn blob_schema() -> Schema {
        let id = Field::new("id", DataType::Int64, false);
        let image = blob("image", true);
        Schema::new(vec![id, image])
    }

    /// Runs `ensure_blob_storage_version` on `params` and returns the
    /// resulting `data_storage_version`.
    fn version_after_ensure(schema: &Schema, mut params: WriteParams) -> Option<LanceFileVersion> {
        ensure_blob_storage_version(schema, &mut params);
        params.data_storage_version
    }

    /// Write params whose storage version is explicitly set to `version`.
    fn params_with_version(version: LanceFileVersion) -> WriteParams {
        WriteParams {
            data_storage_version: Some(version),
            ..Default::default()
        }
    }

    #[test]
    fn blob_field_carries_v2_extension_marker() {
        let image = blob("image", true);
        let ext_name = image.metadata().get(ARROW_EXT_NAME_KEY).map(String::as_str);
        assert_eq!(ext_name, Some("lance.blob.v2"));
        assert!(matches!(image.data_type(), DataType::Struct(_)));
    }

    #[test]
    fn has_blob_columns_detects_blob_fields() {
        assert!(has_blob_columns(&blob_schema()));
        assert!(!has_blob_columns(&plain_schema()));
    }

    #[test]
    fn storage_version_bumps_to_v2_2() {
        let version = version_after_ensure(&blob_schema(), WriteParams::default());
        assert_eq!(version.unwrap().resolve(), LanceFileVersion::V2_2);
    }

    #[test]
    fn storage_version_overrides_lower_explicit_version() {
        let version = version_after_ensure(
            &blob_schema(),
            params_with_version(LanceFileVersion::V2_0),
        );
        assert_eq!(version.unwrap().resolve(), LanceFileVersion::V2_2);
    }

    #[test]
    fn storage_version_keeps_higher_explicit_version() {
        let version = version_after_ensure(
            &blob_schema(),
            params_with_version(LanceFileVersion::V2_3),
        );
        assert_eq!(version.unwrap(), LanceFileVersion::V2_3);
    }

    #[test]
    fn storage_version_noop_without_blob_columns() {
        let version = version_after_ensure(&plain_schema(), WriteParams::default());
        assert!(version.is_none());
    }
}
|
||||
@@ -18,6 +18,7 @@ use lance_table::io::commit::commit_handler_from_url;
|
||||
use object_store::local::LocalFileSystem;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
|
||||
use crate::connection::ConnectRequest;
|
||||
use crate::database::ReadConsistency;
|
||||
use crate::database::namespace::LanceNamespaceDatabase;
|
||||
@@ -838,13 +839,16 @@ impl ListingDatabase {
|
||||
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
|
||||
}
|
||||
|
||||
// Apply enable_stable_row_ids: table-level override takes precedence over connection config
|
||||
if let Some(enable_stable_row_ids) =
|
||||
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids)
|
||||
let data_schema = request.data.arrow_schema();
|
||||
if let Some(enable_stable_row_ids) = stable_row_ids_override
|
||||
.or(self.new_table_config.enable_stable_row_ids)
|
||||
.or(has_blob_columns(&data_schema).then_some(true))
|
||||
{
|
||||
write_params.enable_stable_row_ids = enable_stable_row_ids;
|
||||
}
|
||||
|
||||
ensure_blob_storage_version(&data_schema, &mut write_params);
|
||||
|
||||
if matches!(&request.mode, CreateTableMode::Overwrite) {
|
||||
write_params.mode = WriteMode::Overwrite;
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use lance_namespace_impls::ConnectBuilder;
|
||||
use lance_table::io::commit::CommitHandler;
|
||||
use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler;
|
||||
|
||||
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
|
||||
use crate::connection::NamespaceClientPushdownOperation;
|
||||
use crate::database::ReadConsistency;
|
||||
use crate::database::listing::{
|
||||
@@ -257,12 +258,16 @@ impl LanceNamespaceDatabase {
|
||||
params.enable_v2_manifest_paths = enable_v2_manifest_paths;
|
||||
}
|
||||
|
||||
if let Some(enable_stable_row_ids) =
|
||||
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids)
|
||||
let data_schema = request.data.schema();
|
||||
if let Some(enable_stable_row_ids) = stable_row_ids_override
|
||||
.or(self.new_table_config.enable_stable_row_ids)
|
||||
.or(has_blob_columns(data_schema.as_ref()).then_some(true))
|
||||
{
|
||||
params.enable_stable_row_ids = enable_stable_row_ids;
|
||||
}
|
||||
|
||||
ensure_blob_storage_version(data_schema.as_ref(), params);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,6 +163,7 @@
|
||||
//! ```
|
||||
|
||||
pub mod arrow;
|
||||
pub mod blob;
|
||||
pub mod connection;
|
||||
pub mod data;
|
||||
pub mod database;
|
||||
@@ -188,6 +189,7 @@ use std::{fmt::Display, str::FromStr};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use blob::blob;
|
||||
pub use connection::{ConnectNamespaceBuilder, Connection};
|
||||
pub use error::{Error, Result};
|
||||
use lance_index::vector::ApproxMode as LanceApproxMode;
|
||||
|
||||
@@ -1352,6 +1352,35 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserialize an index's `created_at` field.
|
||||
///
|
||||
/// The server returns this as an RFC 3339 string (e.g. `"2026-06-18T21:37:36.637Z"`),
|
||||
/// but older deployments sent a unix timestamp in milliseconds. Accept both so the
|
||||
/// client works against any server version.
|
||||
fn deserialize_created_at<'de, D>(
|
||||
deserializer: D,
|
||||
) -> std::result::Result<Option<DateTime<Utc>>, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
use serde::de::Error as _;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum CreatedAt {
|
||||
Rfc3339(String),
|
||||
Millis(i64),
|
||||
}
|
||||
|
||||
match Option::<CreatedAt>::deserialize(deserializer)? {
|
||||
None => Ok(None),
|
||||
Some(CreatedAt::Rfc3339(s)) => DateTime::parse_from_rfc3339(&s)
|
||||
.map(|dt| Some(dt.with_timezone(&Utc)))
|
||||
.map_err(D::Error::custom),
|
||||
Some(CreatedAt::Millis(ms)) => Ok(DateTime::from_timestamp_millis(ms)),
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: HttpSend + 'static> RemoteTable<S> {
|
||||
/// Parse the response from `/index/list/` into `IndexConfig` entries.
|
||||
///
|
||||
@@ -1380,7 +1409,7 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
|
||||
// Used as the sentinel to decide whether to skip the stats call.
|
||||
index_type: Option<IndexType>,
|
||||
index_uuid: Option<String>,
|
||||
#[serde(default, with = "chrono::serde::ts_milliseconds_option")]
|
||||
#[serde(default, deserialize_with = "deserialize_created_at")]
|
||||
created_at: Option<DateTime<Utc>>,
|
||||
num_indexed_rows: Option<u64>,
|
||||
num_unindexed_rows: Option<u64>,
|
||||
@@ -4678,7 +4707,7 @@ mod tests {
|
||||
"num_segments": 2,
|
||||
"index_version": 1,
|
||||
"index_details": "{\"num_partitions\":16}",
|
||||
"created_at": 1700000000000i64,
|
||||
"created_at": "2026-06-18T21:37:36.637Z",
|
||||
"type_url": "type.googleapis.com/lance.index.vector.IvfPq",
|
||||
},
|
||||
{
|
||||
@@ -4728,7 +4757,10 @@ mod tests {
|
||||
vec_idx.type_url,
|
||||
Some("type.googleapis.com/lance.index.vector.IvfPq".to_string())
|
||||
);
|
||||
assert!(vec_idx.created_at.is_some());
|
||||
assert_eq!(
|
||||
vec_idx.created_at,
|
||||
Some("2026-06-18T21:37:36.637Z".parse::<DateTime<Utc>>().unwrap())
|
||||
);
|
||||
|
||||
let text_idx = &indices[1];
|
||||
assert_eq!(text_idx.name, "text_idx");
|
||||
@@ -4749,6 +4781,36 @@ mod tests {
|
||||
assert_eq!(text_idx.created_at, None);
|
||||
}
|
||||
|
||||
#[test]
fn test_deserialize_created_at() {
    #[derive(Deserialize)]
    struct Wrapper {
        #[serde(default, deserialize_with = "deserialize_created_at")]
        created_at: Option<DateTime<Utc>>,
    }

    // Parse a JSON document and hand back only the field under test.
    let created_at_of =
        |json: &str| serde_json::from_str::<Wrapper>(json).map(|wrapper| wrapper.created_at);

    // RFC 3339 string (current server format).
    let from_string = created_at_of(r#"{"created_at": "2026-06-18T21:37:36.637Z"}"#).unwrap();
    let expected = "2026-06-18T21:37:36.637Z".parse::<DateTime<Utc>>().unwrap();
    assert_eq!(from_string, Some(expected));

    // Unix milliseconds (legacy server format).
    let from_millis = created_at_of(r#"{"created_at": 1700000000000}"#).unwrap();
    assert_eq!(from_millis, DateTime::from_timestamp_millis(1700000000000));

    // Null and missing both yield None.
    for json in [r#"{"created_at": null}"#, r#"{}"#] {
        assert_eq!(created_at_of(json).unwrap(), None);
    }

    // A malformed string is rejected rather than silently dropped to None.
    assert!(created_at_of(r#"{"created_at": "not-a-date"}"#).is_err());
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_list_versions() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
|
||||
@@ -26,6 +26,9 @@ pub enum AddDataMode {
|
||||
#[default]
|
||||
Append,
|
||||
/// The existing table will be overwritten with the new data
|
||||
///
|
||||
/// On overwrite, raw binary is not coerced into a blob struct. The input
|
||||
/// must declare blob v2 for the column to stay a blob column.
|
||||
Overwrite,
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
//! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
|
||||
|
||||
mod blob_coerce;
|
||||
pub mod cast;
|
||||
pub mod insert;
|
||||
pub mod reject_nan;
|
||||
|
||||
495
rust/lancedb/src/table/datafusion/blob_coerce.rs
Normal file
495
rust/lancedb/src/table/datafusion/blob_coerce.rs
Normal file
@@ -0,0 +1,495 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Coerces write-path input into blob v2 struct columns.
|
||||
//!
|
||||
//! [`super::cast::cast_to_table_schema`] calls [`coerce_blob_expr`].
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{DataType, Field, FieldRef};
|
||||
use datafusion::functions::core::{get_field, named_struct};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::config::ConfigOptions;
|
||||
use datafusion_physical_expr::ScalarFunctionExpr;
|
||||
use datafusion_physical_expr::expressions::{CastExpr, Literal};
|
||||
use datafusion_physical_plan::PhysicalExpr;
|
||||
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
/// Build a projection expression coercing `input_expr` into the blob struct
|
||||
/// declared by `table_field`, composing `named_struct` / `get_field` / `cast`.
|
||||
pub(super) fn coerce_blob_expr(
|
||||
input_expr: Arc<dyn PhysicalExpr>,
|
||||
input_field: &Field,
|
||||
table_field: &FieldRef,
|
||||
config: &Arc<ConfigOptions>,
|
||||
) -> Result<(Arc<dyn PhysicalExpr>, FieldRef)> {
|
||||
let DataType::Struct(declared_fields) = table_field.data_type() else {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"blob v2 column '{}' must be a struct, table declares {}",
|
||||
table_field.name(),
|
||||
table_field.data_type()
|
||||
),
|
||||
});
|
||||
};
|
||||
|
||||
let input_struct_children = match input_field.data_type() {
|
||||
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
|
||||
DataType::Struct(children) => {
|
||||
if !children
|
||||
.iter()
|
||||
.any(|c| c.name() == "data" || c.name() == "uri")
|
||||
{
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"blob struct input for column '{}' must contain a 'data' or 'uri' child",
|
||||
table_field.name()
|
||||
),
|
||||
});
|
||||
}
|
||||
Some(children)
|
||||
}
|
||||
other => {
|
||||
return Err(Error::InvalidInput {
|
||||
message: format!(
|
||||
"cannot coerce column '{}' with type {} into a blob v2 struct. \
|
||||
expected Binary, LargeBinary, BinaryView, or a Struct with a 'data' or 'uri' child",
|
||||
table_field.name(),
|
||||
other,
|
||||
),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let mut ns_args: Vec<Arc<dyn PhysicalExpr>> = Vec::with_capacity(declared_fields.len() * 2);
|
||||
for declared in declared_fields.iter() {
|
||||
ns_args.push(Arc::new(Literal::new(ScalarValue::from(
|
||||
declared.name().as_str(),
|
||||
))));
|
||||
|
||||
let value: Arc<dyn PhysicalExpr> = match input_struct_children {
|
||||
// Raw binary lands in `data` and everything else is a typed null.
|
||||
None => {
|
||||
if declared.name() == "data" {
|
||||
Arc::new(CastExpr::new(
|
||||
input_expr.clone(),
|
||||
declared.data_type().clone(),
|
||||
None,
|
||||
))
|
||||
} else {
|
||||
typed_null(declared.data_type())?
|
||||
}
|
||||
}
|
||||
Some(children) => match children.iter().find(|c| c.name() == declared.name()) {
|
||||
Some(child) => {
|
||||
let field_expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
|
||||
&format!("get_field({})", declared.name()),
|
||||
get_field(),
|
||||
vec![
|
||||
input_expr.clone(),
|
||||
Arc::new(Literal::new(ScalarValue::from(declared.name().as_str()))),
|
||||
],
|
||||
Arc::new(child.as_ref().clone()),
|
||||
config.clone(),
|
||||
));
|
||||
if child.data_type() == declared.data_type() {
|
||||
field_expr
|
||||
} else {
|
||||
Arc::new(CastExpr::new(
|
||||
field_expr,
|
||||
declared.data_type().clone(),
|
||||
None,
|
||||
))
|
||||
}
|
||||
}
|
||||
None => typed_null(declared.data_type())?,
|
||||
},
|
||||
};
|
||||
ns_args.push(value);
|
||||
}
|
||||
|
||||
let expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
|
||||
&format!("named_struct({})", table_field.name()),
|
||||
named_struct(),
|
||||
ns_args,
|
||||
table_field.clone(),
|
||||
config.clone(),
|
||||
));
|
||||
Ok((expr, table_field.clone()))
|
||||
}
|
||||
|
||||
fn typed_null(data_type: &DataType) -> Result<Arc<dyn PhysicalExpr>> {
|
||||
let scalar = ScalarValue::try_from(data_type).map_err(|e| Error::InvalidInput {
|
||||
message: format!("cannot build null literal for blob child type {data_type}: {e}"),
|
||||
})?;
|
||||
Ok(Arc::new(Literal::new(scalar)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::cast::cast_to_table_schema;
    use super::*;
    use crate::blob::blob;
    use arrow_array::{
        Array, ArrayRef, BinaryArray, BinaryViewArray, Int32Array, Int64Array, LargeBinaryArray,
        RecordBatch, StringArray, StructArray, UInt8Array, UInt64Array,
    };
    use arrow_schema::Schema;
    use datafusion::prelude::SessionContext;
    use datafusion_catalog::MemTable;
    use datafusion_physical_plan::ExecutionPlan;
    use futures::TryStreamExt;
    use lance_arrow::FieldExt;
    use std::collections::HashMap;

    /// Blob-marked struct field with `data`, `uri`, `position` and `size` children.
    fn wide_blob_field(name: &str) -> Field {
        let children = vec![
            Field::new("data", DataType::LargeBinary, true),
            Field::new("uri", DataType::Utf8, true),
            Field::new("position", DataType::UInt64, true),
            Field::new("size", DataType::UInt64, true),
        ];
        let marker = HashMap::from([(
            "ARROW:extension:name".to_string(),
            "lance.blob.v2".to_string(),
        )]);
        Field::new(name, DataType::Struct(children.into()), true).with_metadata(marker)
    }

    /// Table schema: `id` plus the blob field produced by `blob("image", true)`.
    fn blob_table_schema() -> Schema {
        let id = Field::new("id", DataType::Int64, false);
        Schema::new(vec![id, blob("image", true)])
    }

    /// Table schema: `id` plus the four-child blob field.
    fn wide_blob_table_schema() -> Schema {
        let id = Field::new("id", DataType::Int64, false);
        Schema::new(vec![id, wide_blob_field("image")])
    }

    /// Struct array laid out like `blob("image", true)`: one binary payload row
    /// and an all-null string column.
    fn narrow_blob_struct(payload: &[u8]) -> StructArray {
        let DataType::Struct(children) = blob("image", true).data_type().clone() else {
            unreachable!("blob field is a struct")
        };
        StructArray::new(
            children,
            vec![
                Arc::new(LargeBinaryArray::from_iter_values([payload])),
                Arc::new(StringArray::from(vec![None::<&str>])),
            ],
            None,
        )
    }

    /// Batch with a sequential `id` column next to the supplied `image` column.
    fn batch_with_image(image_field: Field, image: ArrayRef) -> RecordBatch {
        let row_count = image.len();
        let ids: ArrayRef = Arc::new(Int64Array::from_iter_values(0..row_count as i64));
        let schema = Schema::new(vec![Field::new("id", DataType::Int64, false), image_field]);
        RecordBatch::try_new(Arc::new(schema), vec![ids, image]).unwrap()
    }

    /// The `image` column of `batch`, viewed as a struct array.
    fn image_struct(batch: &RecordBatch) -> &StructArray {
        let column = batch.column_by_name("image").unwrap();
        column.as_any().downcast_ref::<StructArray>().unwrap()
    }

    /// Child `name` of `parent`, downcast to the concrete array type `T`.
    fn child_as<'a, T: Array + 'static>(parent: &'a StructArray, name: &str) -> &'a T {
        parent
            .column_by_name(name)
            .unwrap()
            .as_any()
            .downcast_ref::<T>()
            .unwrap()
    }

    /// Whether the `image` field of `batch` carries the blob v2 marker.
    fn image_is_blob_v2(batch: &RecordBatch) -> bool {
        batch
            .schema()
            .field_with_name("image")
            .unwrap()
            .is_blob_v2()
    }

    /// Physical plan scanning an in-memory table that holds only `batch`.
    async fn plan_from_batch(batch: RecordBatch) -> Arc<dyn ExecutionPlan> {
        let mem_table = MemTable::try_new(batch.schema(), vec![vec![batch]]).unwrap();
        let ctx = SessionContext::new();
        ctx.register_table("t", Arc::new(mem_table)).unwrap();
        let frame = ctx.table("t").await.unwrap();
        frame.create_physical_plan().await.unwrap()
    }

    /// Run `batch` through `cast_to_table_schema` and collect the single result.
    async fn coerce(batch: RecordBatch, table_schema: &Schema) -> RecordBatch {
        let input_plan = plan_from_batch(batch).await;
        let projected = cast_to_table_schema(input_plan, table_schema).unwrap();
        let ctx = SessionContext::new();
        let stream = projected.execute(0, ctx.task_ctx()).unwrap();
        let collected: Vec<RecordBatch> = stream.try_collect().await.unwrap();
        arrow_select::concat::concat_batches(&projected.schema(), &collected).unwrap()
    }

    /// Like [`coerce`], but expects planning to fail and returns the error.
    async fn coerce_err(batch: RecordBatch, table_schema: &Schema) -> Error {
        let input_plan = plan_from_batch(batch).await;
        cast_to_table_schema(input_plan, table_schema).unwrap_err()
    }

    #[tokio::test]
    async fn large_binary_coerces_to_declared_blob_struct() {
        let payload: ArrayRef =
            Arc::new(LargeBinaryArray::from_iter_values([b"hello".as_slice()]));
        let input = batch_with_image(Field::new("image", DataType::LargeBinary, true), payload);
        let coerced = coerce(input, &blob_table_schema()).await;

        let image_field = coerced.schema().field_with_name("image").unwrap().clone();
        assert!(image_field.is_blob_v2());
        assert!(matches!(image_field.data_type(), DataType::Struct(_)));

        let data = child_as::<LargeBinaryArray>(image_struct(&coerced), "data");
        assert_eq!(data.value(0), b"hello");
    }

    #[tokio::test]
    async fn binary_coerces_to_declared_blob_struct() {
        let payload: ArrayRef = Arc::new(BinaryArray::from_iter_values([b"hi".as_slice()]));
        let input = batch_with_image(Field::new("image", DataType::Binary, true), payload);
        let coerced = coerce(input, &blob_table_schema()).await;
        assert!(image_is_blob_v2(&coerced));
    }

    #[tokio::test]
    async fn binary_view_coerces_to_declared_blob_struct() {
        let payload: ArrayRef =
            Arc::new(BinaryViewArray::from_iter_values([b"view".as_slice()]));
        let input = batch_with_image(Field::new("image", DataType::BinaryView, true), payload);
        let coerced = coerce(input, &blob_table_schema()).await;

        let data = child_as::<LargeBinaryArray>(image_struct(&coerced), "data");
        assert_eq!(data.value(0), b"view");
    }

    #[tokio::test]
    async fn binary_nulls_stay_null_after_coercion() {
        let rows = vec![Some(b"present".as_slice()), None];
        let payload: ArrayRef = Arc::new(BinaryArray::from_iter(rows));
        let input = batch_with_image(Field::new("image", DataType::Binary, true), payload);
        let coerced = coerce(input, &blob_table_schema()).await;

        let data = image_struct(&coerced).column_by_name("data").unwrap();
        assert!(!data.is_null(0));
        assert!(data.is_null(1));
    }

    #[tokio::test]
    async fn binary_coerces_into_four_child_blob_layout() {
        let table_schema = wide_blob_table_schema();
        let rows = vec![Some(b"alpha".as_slice()), None];
        let payload: ArrayRef = Arc::new(LargeBinaryArray::from_iter(rows));
        let input = batch_with_image(Field::new("image", DataType::LargeBinary, true), payload);
        let coerced = coerce(input, &table_schema).await;

        let image = image_struct(&coerced);
        assert_eq!(
            image.num_columns(),
            4,
            "coerced struct keeps the declared layout"
        );
        for null_child in ["position", "size"] {
            assert!(image.column_by_name(null_child).unwrap().is_null(0));
        }
        let data = image.column_by_name("data").unwrap();
        assert!(!data.is_null(0));
        assert!(data.is_null(1));
    }

    #[tokio::test]
    async fn prebuilt_struct_gains_blob_field_metadata() {
        let prebuilt = narrow_blob_struct(b"prebuilt");
        // The input field deliberately has no blob metadata.
        let plain_field = Field::new("image", prebuilt.data_type().clone(), true);
        let input = batch_with_image(plain_field, Arc::new(prebuilt));
        let coerced = coerce(input, &blob_table_schema()).await;
        assert!(image_is_blob_v2(&coerced));
    }

    #[tokio::test]
    async fn prebuilt_narrow_struct_widens_to_declared_layout() {
        let prebuilt = narrow_blob_struct(b"prebuilt");
        let table_schema = wide_blob_table_schema();
        let plain_field = Field::new("image", prebuilt.data_type().clone(), true);
        let input = batch_with_image(plain_field, Arc::new(prebuilt));
        let coerced = coerce(input, &table_schema).await;

        let image = image_struct(&coerced);
        assert_eq!(image.num_columns(), 4);
        for null_child in ["position", "size"] {
            assert!(image.column_by_name(null_child).unwrap().is_null(0));
        }
    }

    #[tokio::test]
    async fn external_reference_struct_preserves_uri_position_and_size() {
        let children = vec![
            Field::new("data", DataType::LargeBinary, true),
            Field::new("uri", DataType::Utf8, true),
            Field::new("position", DataType::UInt64, true),
            Field::new("size", DataType::UInt64, true),
        ];
        let prebuilt = StructArray::new(
            children.into(),
            vec![
                Arc::new(LargeBinaryArray::from(vec![None::<&[u8]>])) as ArrayRef,
                Arc::new(StringArray::from(vec![Some("s3://bucket/blob.bin")])) as ArrayRef,
                Arc::new(UInt64Array::from(vec![Some(7)])) as ArrayRef,
                Arc::new(UInt64Array::from(vec![Some(6)])) as ArrayRef,
            ],
            None,
        );
        let table_schema = wide_blob_table_schema();
        let plain_field = Field::new("image", prebuilt.data_type().clone(), true);
        let input = batch_with_image(plain_field, Arc::new(prebuilt));
        let coerced = coerce(input, &table_schema).await;
        let image = image_struct(&coerced);

        assert_eq!(
            child_as::<StringArray>(image, "uri").value(0),
            "s3://bucket/blob.bin"
        );
        assert_eq!(child_as::<UInt64Array>(image, "position").value(0), 7);
        assert_eq!(child_as::<UInt64Array>(image, "size").value(0), 6);
        assert!(image.column_by_name("data").unwrap().is_null(0));
    }

    #[tokio::test]
    async fn descriptor_struct_without_value_child_is_rejected() {
        let children = vec![
            Field::new("kind", DataType::UInt8, false),
            Field::new("position", DataType::UInt64, false),
            Field::new("size", DataType::UInt64, false),
        ];
        let descriptor = StructArray::new(
            children.into(),
            vec![
                Arc::new(UInt8Array::from(vec![0])),
                Arc::new(UInt64Array::from(vec![0])),
                Arc::new(UInt64Array::from(vec![0])),
            ],
            None,
        );
        let descriptor_field = Field::new("image", descriptor.data_type().clone(), true);
        let input = batch_with_image(descriptor_field, Arc::new(descriptor));

        let message = coerce_err(input, &blob_table_schema()).await.to_string();
        assert!(message.contains("'data' or 'uri'"));
        assert!(message.contains("image"));
    }

    #[tokio::test]
    async fn unsupported_input_type_is_rejected_with_column_name() {
        let text: ArrayRef = Arc::new(StringArray::from(vec!["not bytes"]));
        let input = batch_with_image(Field::new("image", DataType::Utf8, true), text);

        let err = coerce_err(input, &blob_table_schema()).await;
        assert!(matches!(err, Error::InvalidInput { .. }), "got {err:?}");
        assert!(err.to_string().contains("image"));
    }

    #[tokio::test]
    async fn blob_metadata_survives_cast_of_sibling_column() {
        // `id` arrives as Int32, so it needs its own cast to the table's Int64.
        let input_schema = Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("image", DataType::LargeBinary, true),
        ]);
        let input = RecordBatch::try_new(
            Arc::new(input_schema),
            vec![
                Arc::new(Int32Array::from(vec![1])),
                Arc::new(LargeBinaryArray::from_iter_values([b"x".as_slice()])),
            ],
        )
        .unwrap();
        let coerced = coerce(input, &blob_table_schema()).await;

        let image_field = coerced.schema().field_with_name("image").unwrap().clone();
        assert!(
            image_field.is_blob_v2(),
            "expected blob marker on image field, got {:?}",
            image_field.metadata()
        );
        let id_field = coerced.schema().field_with_name("id").unwrap().clone();
        assert_eq!(id_field.data_type(), &DataType::Int64);
    }

    #[tokio::test]
    async fn exact_blob_input_passes_through_unchanged() {
        let image = narrow_blob_struct(b"exact");
        // The input field is the table's own blob field.
        let input = batch_with_image(blob("image", true), Arc::new(image));
        let table_schema = blob_table_schema();

        let source_plan = plan_from_batch(input).await;
        let source_ptr = Arc::as_ptr(&source_plan);
        let result_plan = cast_to_table_schema(source_plan, &table_schema).unwrap();
        assert_eq!(Arc::as_ptr(&result_plan), source_ptr, "no projection inserted");
    }
}
|
||||
@@ -13,8 +13,10 @@ use datafusion_physical_expr::expressions::{CastExpr, Literal};
|
||||
use datafusion_physical_plan::expressions::Column;
|
||||
use datafusion_physical_plan::projection::ProjectionExec;
|
||||
use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
|
||||
use lance_arrow::FieldExt;
|
||||
use lance_arrow::json::{is_arrow_json_field, is_json_field};
|
||||
|
||||
use super::blob_coerce::coerce_blob_expr;
|
||||
use crate::{Error, Result};
|
||||
|
||||
pub fn cast_to_table_schema(
|
||||
@@ -77,6 +79,17 @@ fn build_field_exprs(
|
||||
continue;
|
||||
}
|
||||
|
||||
// Blob columns accept raw binary on write; exact matches pass through below.
|
||||
if table_field.is_blob_v2() && input_field.as_ref() != table_field.as_ref() {
|
||||
result.push(coerce_blob_expr(
|
||||
input_expr,
|
||||
input_field,
|
||||
table_field,
|
||||
&config,
|
||||
)?);
|
||||
continue;
|
||||
}
|
||||
|
||||
let expr = match (input_field.data_type(), table_field.data_type()) {
|
||||
// Both are structs: recurse into sub-fields to handle subschemas and casts.
|
||||
(DataType::Struct(in_children), DataType::Struct(tbl_children))
|
||||
|
||||
380
rust/lancedb/tests/blob_integration.rs
Normal file
380
rust/lancedb/tests/blob_integration.rs
Normal file
@@ -0,0 +1,380 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Integration tests for blob v2 columns.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::{Array, BinaryArray, Int64Array, LargeBinaryArray, RecordBatch, StructArray};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use futures::TryStreamExt;
|
||||
use lance_encoding::version::LanceFileVersion;
|
||||
use lancedb::{
|
||||
Connection, Result, Table, blob::blob, connect,
|
||||
database::listing::OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, query::ExecutableQuery,
|
||||
};
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn blob_table_schema() -> Arc<Schema> {
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
blob("image", true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn binary_input_batch(ids: &[i64], payloads: &[Option<&[u8]>]) -> RecordBatch {
|
||||
RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
Field::new("image", DataType::LargeBinary, true),
|
||||
])),
|
||||
vec![
|
||||
Arc::new(Int64Array::from(ids.to_vec())),
|
||||
Arc::new(LargeBinaryArray::from_iter(payloads.iter().copied())),
|
||||
],
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn create_inline_blob_table(
|
||||
db: &Connection,
|
||||
name: &str,
|
||||
ids: &[i64],
|
||||
payloads: &[Option<&[u8]>],
|
||||
) -> Result<Table> {
|
||||
let table = db
|
||||
.create_empty_table(name, blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
table
|
||||
.add(binary_input_batch(ids, payloads))
|
||||
.execute()
|
||||
.await?;
|
||||
Ok(table)
|
||||
}
|
||||
|
||||
/// Resolved Lance file version recorded in the table's manifest.
///
/// Panics if the table is not native or the manifest cannot be read.
async fn storage_format_version(table: &Table) -> LanceFileVersion {
    let native = table.as_native().unwrap();
    let manifest = native.manifest().await.unwrap();
    let version = manifest.data_storage_format.lance_file_version().unwrap();
    version.resolve()
}
|
||||
|
||||
async fn uses_stable_row_ids(table: &Table) -> bool {
|
||||
table
|
||||
.as_native()
|
||||
.unwrap()
|
||||
.manifest()
|
||||
.await
|
||||
.unwrap()
|
||||
.uses_stable_row_ids()
|
||||
}
|
||||
|
||||
async fn query_image_struct(table: &Table) -> StructArray {
|
||||
let batches = table
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.unwrap()
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
let batch = arrow_select::concat::concat_batches(&batches[0].schema(), &batches).unwrap();
|
||||
batch
|
||||
.column_by_name("image")
|
||||
.expect("image column present")
|
||||
.as_any()
|
||||
.downcast_ref::<StructArray>()
|
||||
.expect("blob column reads back as a descriptor struct")
|
||||
.clone()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn declaring_blob_column_bumps_format_and_enables_stable_row_ids() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
|
||||
assert!(uses_stable_row_ids(&table).await);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn explicit_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
assert!(
|
||||
storage_format_version(&table).await >= LanceFileVersion::V2_2,
|
||||
"format bump still applies; the schema cannot be written below 2.2"
|
||||
);
|
||||
assert!(!uses_stable_row_ids(&table).await);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn non_blob_table_keeps_default_format_and_row_id_setting() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
|
||||
let table = db.create_empty_table("t", schema).execute().await?;
|
||||
|
||||
assert!(storage_format_version(&table).await < LanceFileVersion::V2_2);
|
||||
assert!(!uses_stable_row_ids(&table).await);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn creating_with_blob_data_bumps_format() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
|
||||
// Batch already declares the blob field (pre-built struct).
|
||||
let blob_field = blob("image", true);
|
||||
let DataType::Struct(children) = blob_field.data_type().clone() else {
|
||||
unreachable!("blob field is a struct")
|
||||
};
|
||||
let image = StructArray::new(
|
||||
children,
|
||||
vec![
|
||||
Arc::new(LargeBinaryArray::from_iter_values([b"payload".as_slice()])),
|
||||
Arc::new(arrow_array::StringArray::from(vec![None::<&str>])),
|
||||
],
|
||||
None,
|
||||
);
|
||||
let batch = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
blob_field,
|
||||
])),
|
||||
vec![Arc::new(Int64Array::from(vec![1])), Arc::new(image)],
|
||||
)
|
||||
.unwrap();
|
||||
let table = db.create_table("t", batch).execute().await?;
|
||||
|
||||
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
|
||||
assert!(uses_stable_row_ids(&table).await);
|
||||
assert_eq!(table.count_rows(None).await?, 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn add_coerces_large_binary_into_blob_column() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table =
|
||||
create_inline_blob_table(&db, "t", &[1, 2], &[Some(b"cat".as_slice()), Some(b"dog")])
|
||||
.await?;
|
||||
|
||||
assert_eq!(table.count_rows(None).await?, 2);
|
||||
let image = query_image_struct(&table).await;
|
||||
assert_eq!(image.len(), 2);
|
||||
// Table schema still has the blob marker after append.
|
||||
let schema = table.schema().await?;
|
||||
let field = schema.field_with_name("image").unwrap();
|
||||
assert_eq!(
|
||||
field
|
||||
.metadata()
|
||||
.get("ARROW:extension:name")
|
||||
.map(String::as_str),
|
||||
Some("lance.blob.v2")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn add_coerces_binary_into_blob_column() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
Field::new("image", DataType::Binary, true),
|
||||
])),
|
||||
vec![
|
||||
Arc::new(Int64Array::from(vec![1])),
|
||||
Arc::new(BinaryArray::from_iter_values([b"small".as_slice()])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
table.add(batch).execute().await?;
|
||||
|
||||
assert_eq!(table.count_rows(None).await?, 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn add_accepts_null_blob_rows() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = create_inline_blob_table(
|
||||
&db,
|
||||
"t",
|
||||
&[1, 2, 3],
|
||||
&[Some(b"first".as_slice()), None, Some(b"third")],
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(table.count_rows(None).await?, 3);
|
||||
let image = query_image_struct(&table).await;
|
||||
assert_eq!(image.len(), 3);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn add_rejects_uncoercible_blob_input() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
Field::new("image", DataType::Utf8, true),
|
||||
])),
|
||||
vec![
|
||||
Arc::new(Int64Array::from(vec![1])),
|
||||
Arc::new(arrow_array::StringArray::from(vec!["not bytes"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
let err = table.add(batch).execute().await.unwrap_err();
|
||||
assert!(err.to_string().contains("image"), "got: {err}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_level_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap())
|
||||
.storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
|
||||
.execute()
|
||||
.await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
|
||||
assert!(!uses_stable_row_ids(&table).await);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn namespace_create_applies_blob_defaults() -> Result<()> {
|
||||
let tmp = tempdir().unwrap();
|
||||
let mut properties = std::collections::HashMap::new();
|
||||
properties.insert("root".to_string(), tmp.path().to_str().unwrap().to_string());
|
||||
let db = lancedb::connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await?;
|
||||
let table = db
|
||||
.create_empty_table("t", blob_table_schema())
|
||||
.execute()
|
||||
.await?;
|
||||
|
||||
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
|
||||
assert!(uses_stable_row_ids(&table).await);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Overwrite takes the input schema as-is (same as cast skip). Raw binary
|
||||
// overwrite drops the blob marker unless the input declares blob v2.
|
||||
#[tokio::test]
|
||||
async fn overwrite_replaces_blob_schema_with_input_schema() -> Result<()> {
|
||||
use lancedb::table::AddDataMode;
|
||||
|
||||
let tmp = tempdir().unwrap();
|
||||
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
|
||||
let table = create_inline_blob_table(&db, "t", &[1], &[Some(b"blob".as_slice())]).await?;
|
||||
|
||||
// Raw binary overwrite. Plain LargeBinary replaces the blob declaration.
|
||||
let raw_schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
Field::new("image", DataType::LargeBinary, true),
|
||||
]));
|
||||
let raw_batch = RecordBatch::try_new(
|
||||
raw_schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int64Array::from(vec![2])),
|
||||
Arc::new(LargeBinaryArray::from_iter_values([b"plain".as_slice()])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
table
|
||||
.add(raw_batch)
|
||||
.mode(AddDataMode::Overwrite)
|
||||
.execute()
|
||||
.await?;
|
||||
let schema = table.schema().await?;
|
||||
assert_eq!(schema, raw_schema);
|
||||
assert!(
|
||||
!schema
|
||||
.field_with_name("image")
|
||||
.unwrap()
|
||||
.metadata()
|
||||
.contains_key("ARROW:extension:name"),
|
||||
"raw binary overwrite leaves a plain binary column"
|
||||
);
|
||||
|
||||
// Overwrite with a declared blob struct keeps the blob column.
|
||||
let blob_field = blob("image", true);
|
||||
let DataType::Struct(children) = blob_field.data_type().clone() else {
|
||||
unreachable!("blob field is a struct")
|
||||
};
|
||||
let image = StructArray::new(
|
||||
children,
|
||||
vec![
|
||||
Arc::new(LargeBinaryArray::from_iter_values([b"declared".as_slice()])),
|
||||
Arc::new(arrow_array::StringArray::from(vec![None::<&str>])),
|
||||
],
|
||||
None,
|
||||
);
|
||||
let declared_batch = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int64, false),
|
||||
blob_field,
|
||||
])),
|
||||
vec![Arc::new(Int64Array::from(vec![3])), Arc::new(image)],
|
||||
)
|
||||
.unwrap();
|
||||
table
|
||||
.add(declared_batch)
|
||||
.mode(AddDataMode::Overwrite)
|
||||
.execute()
|
||||
.await?;
|
||||
let schema = table.schema().await?;
|
||||
assert_eq!(
|
||||
schema
|
||||
.field_with_name("image")
|
||||
.unwrap()
|
||||
.metadata()
|
||||
.get("ARROW:extension:name")
|
||||
.map(String::as_str),
|
||||
Some("lance.blob.v2")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user