Compare commits

..

2 Commits

Author SHA1 Message Date
Yang Cen
7ff72022dd feat(query): add approx mode to vector queries (#3549)
## Feature

### What is the new feature?

Adds Rust core API support for configuring vector query approximation
mode with `ApproxMode::{Fast, Normal, Accurate}`.

### Why do we need this feature?

Lance already exposes `lance_index::vector::ApproxMode` and scanner
support for controlling the speed/accuracy tradeoff for approximate
vector search. LanceDB Rust queries need to expose and pass this setting
through for local/native and remote vector searches.

### How does it work?

- Adds public `ApproxMode` in `rust/lancedb`, with lowercase serde,
`Default::Normal`, parse/display, and conversions to/from Lance's
`ApproxMode`.
- Adds `approx_mode: Option<ApproxMode>` to `VectorQueryRequest` and a
`VectorQuery::approx_mode(...)` builder.
- Applies the mode to native/local Lance scanners after `nearest(...)`
when explicitly set.
- Sends `approx_mode` in remote query JSON only when explicitly set;
default requests omit it.

## Validation

- `cargo fmt --all`
- `cargo test --quiet --features remote approx_mode`
- `cargo test --quiet --features remote
test_query_vector_default_values`
- `cargo check --quiet --features remote --tests --examples`
- `git diff --check`
2026-06-17 20:21:02 +08:00
lancedb automation
b2e0aa0588 chore: update lance dependency to v8.0.0-beta.16 2026-06-17 01:41:42 +00:00
48 changed files with 152 additions and 2004 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.31.0-beta.1"
current_version = "0.30.1-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
@@ -23,8 +23,6 @@ allow_dirty = true
commit = true
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
# bump-my-version >=1.4.0 rejects pre_commit_hooks containing shell syntax unless opted in.
allow_shell_hooks = true
# Java maven files
pre_commit_hooks = [

96
Cargo.lock generated
View File

@@ -3432,8 +3432,8 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"rand 0.9.4",
@@ -4735,8 +4735,8 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a"
[[package]]
name = "lance"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arc-swap",
"arrow",
@@ -4810,8 +4810,8 @@ dependencies = [
[[package]]
name = "lance-arrow"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4832,7 +4832,7 @@ dependencies = [
[[package]]
name = "lance-arrow-scalar"
version = "58.0.0"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4846,7 +4846,7 @@ dependencies = [
[[package]]
name = "lance-arrow-stats"
version = "58.0.0"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -4855,8 +4855,8 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrayref",
"paste",
@@ -4865,8 +4865,8 @@ dependencies = [
[[package]]
name = "lance-core"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -4904,8 +4904,8 @@ dependencies = [
[[package]]
name = "lance-datafusion"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"arrow-array",
@@ -4935,8 +4935,8 @@ dependencies = [
[[package]]
name = "lance-datagen"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"arrow-array",
@@ -4953,8 +4953,8 @@ dependencies = [
[[package]]
name = "lance-derive"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"proc-macro2",
"quote",
@@ -4963,8 +4963,8 @@ dependencies = [
[[package]]
name = "lance-encoding"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -4999,8 +4999,8 @@ dependencies = [
[[package]]
name = "lance-file"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -5030,8 +5030,8 @@ dependencies = [
[[package]]
name = "lance-index"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arc-swap",
"arrow",
@@ -5096,8 +5096,8 @@ dependencies = [
[[package]]
name = "lance-io"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"arrow-arith",
@@ -5138,8 +5138,8 @@ dependencies = [
[[package]]
name = "lance-linalg"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -5154,8 +5154,8 @@ dependencies = [
[[package]]
name = "lance-namespace"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"async-trait",
@@ -5167,8 +5167,8 @@ dependencies = [
[[package]]
name = "lance-namespace-impls"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"arrow-ipc",
@@ -5222,8 +5222,8 @@ dependencies = [
[[package]]
name = "lance-select"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -5238,8 +5238,8 @@ dependencies = [
[[package]]
name = "lance-table"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow",
"arrow-array",
@@ -5278,8 +5278,8 @@ dependencies = [
[[package]]
name = "lance-testing"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"arrow-array",
"arrow-schema",
@@ -5292,21 +5292,20 @@ dependencies = [
[[package]]
name = "lance-tokenizer"
version = "8.0.0-rc.1"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-rc.1#eea4095b188bf2ba2fa95d934a2f5d6c2c9e661c"
version = "8.0.0-beta.16"
source = "git+https://github.com/lance-format/lance.git?tag=v8.0.0-beta.16#6e734df607f2841fe3bba82f05a90f3174933bab"
dependencies = [
"icu_segmenter",
"jieba-rs",
"lindera",
"rust-stemmers",
"serde",
"stop-words",
"unicode-normalization",
]
[[package]]
name = "lancedb"
version = "0.31.0-beta.1"
version = "0.30.1-beta.2"
dependencies = [
"ahash",
"anyhow",
@@ -5389,7 +5388,7 @@ dependencies = [
[[package]]
name = "lancedb-nodejs"
version = "0.31.0-beta.1"
version = "0.30.1-beta.2"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -5414,7 +5413,7 @@ dependencies = [
[[package]]
name = "lancedb-python"
version = "0.34.0-beta.1"
version = "0.33.1-beta.2"
dependencies = [
"arrow",
"async-trait",
@@ -9206,15 +9205,6 @@ version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978"
[[package]]
name = "stop-words"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d"
dependencies = [
"serde_json",
]
[[package]]
name = "str_stack"
version = "0.1.1"

View File

@@ -13,20 +13,20 @@ categories = ["database-implementations"]
rust-version = "1.91.0"
[workspace.dependencies]
lance = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=8.0.0-rc.1", default-features = false, "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=8.0.0-rc.1", "tag" = "v8.0.0-rc.1", "git" = "https://github.com/lance-format/lance.git" }
lance = { "version" = "=8.0.0-beta.16", default-features = false, "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-core = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-datagen = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-file = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-io = { "version" = "=8.0.0-beta.16", default-features = false, "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-index = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-linalg = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-namespace-impls = { "version" = "=8.0.0-beta.16", default-features = false, "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-table = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-testing = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-datafusion = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-encoding = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
lance-arrow = { "version" = "=8.0.0-beta.16", "tag" = "v8.0.0-beta.16", "git" = "https://github.com/lance-format/lance.git" }
ahash = "0.8"
# Note that this one does not include pyarrow
arrow = { version = "58.0.0", optional = false }

View File

@@ -14,7 +14,7 @@ Add the following dependency to your `pom.xml`:
<dependency>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-core</artifactId>
<version>0.31.0-beta.1</version>
<version>0.30.1-beta.2</version>
</dependency>
```

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.31.0-beta.1</version>
<version>0.30.1-beta.2</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.31.0-beta.1</version>
<version>0.30.1-beta.2</version>
<packaging>pom</packaging>
<name>${project.artifactId}</name>
<description>LanceDB Java SDK Parent POM</description>
@@ -28,7 +28,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<arrow.version>15.0.0</arrow.version>
<lance-core.version>8.0.0-rc.1</lance-core.version>
<lance-core.version>8.0.0-beta.16</lance-core.version>
<spotless.skip>false</spotless.skip>
<spotless.version>2.30.0</spotless.version>
<spotless.java.googlejavaformat.version>1.7</spotless.java.googlejavaformat.version>

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.31.0-beta.1"
version = "0.30.1-beta.2"
publish = false
license.workspace = true
description.workspace = true

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -1,12 +1,12 @@
{
"name": "@lancedb/lancedb",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@lancedb/lancedb",
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"cpu": [
"x64",
"arm64"

View File

@@ -11,7 +11,7 @@
"ann"
],
"private": false,
"version": "0.31.0-beta.1",
"version": "0.30.1-beta.2",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.34.0-beta.1"
current_version = "0.33.1-beta.2"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
@@ -23,8 +23,6 @@ allow_dirty = true
commit = true
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
# bump-my-version >=1.4.0 rejects pre_commit_hooks containing shell syntax unless opted in.
allow_shell_hooks = true
# Update Cargo.lock after version bump
pre_commit_hooks = [

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.34.0-beta.1"
version = "0.33.1-beta.2"
publish = false
edition.workspace = true
description = "Python bindings for LanceDB"

View File

@@ -71,9 +71,6 @@ from lancedb.embeddings import EmbeddingFunctionConfig
from ._lancedb import Session
_MAX_QUERY_K = 2**31 - 1
def _query_to_namespace_request(
table_id: List[str],
query: "Query",
@@ -151,8 +148,7 @@ def _query_to_namespace_request(
if query.limit is not None:
k = query.limit
elif query.vector is None and query.full_text_query is None:
# limit k to max i32 value to avoid client overflows
k = _MAX_QUERY_K
k = sys.maxsize
else:
k = 10

View File

@@ -275,18 +275,7 @@ def _py_type_to_arrow_type(py_type: Type[Any], field: FieldInfo) -> pa.DataType:
tz = get_extras(field, "tz")
return pa.timestamp("us", tz=tz)
elif getattr(py_type, "__origin__", None) in (list, tuple):
# A bare, unparameterised ``typing.List`` / ``typing.Tuple`` matches this
# branch (its ``__origin__`` is ``list`` / ``tuple``) but has no
# ``__args__``, so we cannot infer the element type. Raise a clear
# ``TypeError`` instead of crashing with an opaque ``AttributeError``.
args = getattr(py_type, "__args__", None)
if not args:
raise TypeError(
"Converting Pydantic type to Arrow Type: unsupported type "
f"{py_type}. Specify the element type, e.g. List[int] instead "
"of a bare List."
)
child = args[0]
child = py_type.__args__[0]
return _pydantic_list_child_to_arrow(child, field)
raise TypeError(
f"Converting Pydantic type to Arrow Type: unsupported type {py_type}."

View File

@@ -86,10 +86,7 @@ def _from_list(data: list) -> Scannable:
@to_scannable.register(dict)
def _from_dict(data: dict) -> Scannable:
raise ValueError(
"Cannot create or add rows from a single dictionary. "
"Use a list of dictionaries instead."
)
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
@to_scannable.register(LanceModel)

View File

@@ -243,10 +243,7 @@ def _into_pyarrow_reader(
raise ValueError("Cannot add a single LanceModel to a table. Use a list.")
if isinstance(data, dict):
raise ValueError(
"Cannot create or add rows from a single dictionary. "
"Use a list of dictionaries instead."
)
raise ValueError("Cannot add a single dictionary to a table. Use a list.")
if isinstance(data, list):
# Handle empty list case

View File

@@ -373,15 +373,9 @@ def _(value: list):
@value_to_sql.register(dict)
def _(value: dict):
# https://datafusion.apache.org/user-guide/sql/scalar_functions.html#named-struct
# Render the field name through value_to_sql(str(...)) as well so that keys
# containing characters meaningful in SQL (e.g. a single quote) are escaped
# the same way string values are. A bare f"'{k}'" would emit invalid SQL for
# a key like "it's".
return (
"named_struct("
+ ", ".join(
f"{value_to_sql(str(k))}, {value_to_sql(v)}" for k, v in value.items()
)
+ ", ".join(f"'{k}', {value_to_sql(v)}" for k, v in value.items())
+ ")"
)

View File

@@ -91,9 +91,7 @@ async def test_create_scalar_index(some_table: AsyncTable):
# Can recreate if replace=True
await some_table.create_index("id", replace=True)
indices = await some_table.list_indices()
assert str(indices).startswith(
'[IndexConfig(name="id_idx", index_type="BTree", columns=["id"]'
)
assert str(indices) == '[Index(BTree, columns=["id"], name="id_idx")]'
assert len(indices) == 1
assert indices[0].index_type == "BTree"
assert indices[0].columns == ["id"]
@@ -108,27 +106,6 @@ async def test_create_scalar_index(some_table: AsyncTable):
assert len(indices) == 0
@pytest.mark.asyncio
async def test_index_config_repr(db_async):
# Use >= 1000 rows so the thousands separator in the repr is exercised.
nrows = 1500
table = await db_async.create_table(
"repr_table", pa.Table.from_pydict({"id": list(range(nrows))})
)
await table.create_index("id", config=BTree())
indices = await table.list_indices()
assert len(indices) == 1
r = repr(indices[0])
assert r.startswith('IndexConfig(name="id_idx", index_type="BTree", columns=["id"]')
# Integer counts use `_` thousands separators (valid Python int syntax).
assert "num_indexed_rows=1_500" in r
assert "num_unindexed_rows=0" in r
# created_at renders as a datetime so the value round-trips.
assert "created_at=datetime.datetime(" in r
assert r.endswith(")")
@pytest.mark.asyncio
async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
metadata_type = pa.struct(
@@ -221,9 +198,7 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
async def test_create_fixed_size_binary_index(some_table: AsyncTable):
await some_table.create_index("fsb", config=BTree())
indices = await some_table.list_indices()
assert str(indices).startswith(
'[IndexConfig(name="fsb_idx", index_type="BTree", columns=["fsb"]'
)
assert str(indices) == '[Index(BTree, columns=["fsb"], name="fsb_idx")]'
assert len(indices) == 1
assert indices[0].index_type == "BTree"
assert indices[0].columns == ["fsb"]
@@ -272,9 +247,7 @@ async def test_create_bitmap_index(some_table: AsyncTable):
async def test_create_label_list_index(some_table: AsyncTable):
await some_table.create_index("tags", config=LabelList())
indices = await some_table.list_indices()
assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
)
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
assert "ScalarIndexQuery" in plan
@@ -289,9 +262,7 @@ async def test_create_large_list_label_list_index(db_async):
await table.create_index("tags", config=LabelList())
indices = await table.list_indices()
assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="LabelList", columns=["tags"]'
)
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
plan = await table.query().where("array_has(tags, 'shared')").explain_plan()
assert "ScalarIndexQuery" in plan
@@ -328,9 +299,7 @@ async def test_create_label_list_index_rejects_list_struct(db_async):
async def test_full_text_search_index(some_table: AsyncTable):
await some_table.create_index("tags", config=FTS(with_position=False))
indices = await some_table.list_indices()
assert str(indices).startswith(
'[IndexConfig(name="tags_idx", index_type="FTS", columns=["tags"]'
)
assert str(indices) == '[Index(FTS, columns=["tags"], name="tags_idx")]'
await some_table.prewarm_index("tags_idx")

View File

@@ -5,11 +5,11 @@
import tempfile
import shutil
import sys
import pytest
import pyarrow as pa
import lancedb
from lance_namespace.errors import NamespaceNotEmptyError, TableNotFoundError
from lancedb.namespace import _MAX_QUERY_K
from lancedb.table import AsyncTable, LanceTable
@@ -816,13 +816,10 @@ class TestPushdownOperations:
["geneva", "hist"],
["geneva", "hist"],
]
# Unlimited reads cap k at i32::MAX (the namespace query_table `k`
# field is i32); sys.maxsize would overflow the Rust binding.
assert [request.k for request in namespace_client.requests] == [
_MAX_QUERY_K,
_MAX_QUERY_K,
sys.maxsize,
sys.maxsize,
]
assert all(r.k <= 2**31 - 1 for r in namespace_client.requests)
@pytest.mark.asyncio
@@ -877,13 +874,10 @@ class TestAsyncPushdownOperations:
["geneva", "hist"],
["geneva", "hist"],
]
# Unlimited reads cap k at i32::MAX (the namespace query_table `k`
# field is i32); sys.maxsize would overflow the Rust binding.
assert [request.k for request in namespace_client.requests] == [
_MAX_QUERY_K,
_MAX_QUERY_K,
sys.maxsize,
sys.maxsize,
]
assert all(r.k <= 2**31 - 1 for r in namespace_client.requests)
def test_local_table_to_arrow_and_to_pandas_are_unchanged(tmp_path):

View File

@@ -188,18 +188,6 @@ def test_nested_struct_list():
assert schema == expect_schema
def test_bare_generic_raises_type_error():
# A bare, unparameterised List/Tuple has no element type to map to Arrow.
# It should raise a clear TypeError, not crash with AttributeError: __args__.
for bare in (List, Tuple):
class TestModel(pydantic.BaseModel):
items: bare
with pytest.raises(TypeError, match="unsupported type"):
pydantic_to_schema(TestModel)
def test_nested_struct_list_optional():
class SplitInfo(pydantic.BaseModel):
start_frame: int

View File

@@ -301,16 +301,6 @@ def test_create_table(mem_db: DBConnection):
assert expected == tbl
def test_create_table_rejects_single_dictionary(mem_db: DBConnection):
data = {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}
with pytest.raises(ValueError) as excep_info:
mem_db.create_table("test", data=data)
assert (
str(excep_info.value) == "Cannot create or add rows from a single dictionary. "
"Use a list of dictionaries instead."
)
def test_empty_table(mem_db: DBConnection):
schema = pa.schema(
[
@@ -340,8 +330,8 @@ def test_add_dictionary(mem_db: DBConnection):
with pytest.raises(ValueError) as excep_info:
tbl.add(data=data)
assert (
str(excep_info.value) == "Cannot create or add rows from a single dictionary. "
"Use a list of dictionaries instead."
str(excep_info.value)
== "Cannot add a single dictionary to a table. Use a list."
)

View File

@@ -149,21 +149,6 @@ def test_value_to_sql_dict():
assert value_to_sql({}) == "named_struct()"
def test_value_to_sql_dict_key_escaping():
# Struct field names that contain a single quote must be escaped (doubled)
# the same way string values are, otherwise value_to_sql emits invalid SQL
# such as named_struct('it's', 1).
assert value_to_sql({"it's": 1}) == "named_struct('it''s', 1)"
assert (
value_to_sql({"o'brien": "d'angelo"}) == "named_struct('o''brien', 'd''angelo')"
)
# Escaping also applies to keys of nested structs.
assert (
value_to_sql({"outer": {"in'r": 1}})
== "named_struct('outer', named_struct('in''r', 1))"
)
def test_value_to_sql_numpy_scalars():
# numpy scalars (e.g. pulled from an ndarray or a pandas column) must
# convert the same way as their native Python counterparts. np.float64

View File

@@ -319,53 +319,11 @@ pub struct IndexConfig {
#[pymethods]
impl IndexConfig {
pub fn __repr__(&self, py: Python<'_>) -> String {
let mut fields = vec![
format!("name={:?}", self.name),
format!("index_type={:?}", self.index_type),
format!("columns={:?}", self.columns),
];
if let Some(v) = &self.index_uuid {
fields.push(format!("index_uuid={:?}", v));
}
if let Some(v) = &self.type_url {
fields.push(format!("type_url={:?}", v));
}
if let Some(v) = self.created_at {
// Render the datetime's own Python repr so the value round-trips,
// falling back to RFC 3339 if the conversion ever fails.
let rendered = v
.into_pyobject(py)
.ok()
.and_then(|obj| obj.into_any().repr().ok())
.map(|r| r.to_string())
.unwrap_or_else(|| v.to_rfc3339());
fields.push(format!("created_at={}", rendered));
}
if let Some(v) = self.num_indexed_rows {
fields.push(format!("num_indexed_rows={}", fmt_thousands(v)));
}
if let Some(v) = self.num_unindexed_rows {
fields.push(format!("num_unindexed_rows={}", fmt_thousands(v)));
}
if let Some(v) = self.size_bytes {
fields.push(format!("size_bytes={}", fmt_thousands(v)));
}
if let Some(v) = self.num_segments {
fields.push(format!("num_segments={}", v));
}
if let Some(v) = self.index_version {
fields.push(format!("index_version={}", v));
}
if let Some(v) = &self.index_details {
let details = v
.bind(py)
.repr()
.map(|r| r.to_string())
.unwrap_or_else(|_| "<unavailable>".to_string());
fields.push(format!("index_details={}", details));
}
format!("IndexConfig({})", fields.join(", "))
pub fn __repr__(&self) -> String {
format!(
"Index({}, columns={:?}, name=\"{}\")",
self.index_type, self.columns, self.name
)
}
// For backwards-compatibility with the old sync SDK, we also support getting
@@ -394,23 +352,6 @@ impl IndexConfig {
}
}
/// Format an integer with `_` thousands separators, e.g. `24_500_213`.
///
/// Underscores are valid Python int-literal syntax, so the repr stays
/// copy-pasteable and machine-parseable while remaining readable.
fn fmt_thousands(n: u64) -> String {
let digits = n.to_string();
let bytes = digits.as_bytes();
let mut out = String::with_capacity(digits.len() + digits.len() / 3);
for (i, b) in bytes.iter().enumerate() {
if i > 0 && (bytes.len() - i).is_multiple_of(3) {
out.push('_');
}
out.push(*b as char);
}
out
}
fn parse_index_details(py: Python<'_>, s: String) -> Py<PyAny> {
let json = py.import("json").expect("json module is always available");
match json.call_method1("loads", (s.as_str(),)) {

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.31.0-beta.1"
version = "0.30.1-beta.2"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -1,126 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Lance blob v2 columns store large binary payloads out of line.
//!
//! Declare a column with [`blob`]. On write, [`crate::table::Table::add`] coerces
//! raw `Binary` / `LargeBinary` into the blob struct layout. Queries return
//! small descriptors, not bytes.
//!
//! Blob tables require Lance file format >= 2.2 and stable row ids at create.
use arrow_schema::{Field, Schema};
use lance::dataset::WriteParams;
use lance_arrow::FieldExt;
use lance_encoding::version::LanceFileVersion;
/// Creates an Arrow field for a Lance blob v2 column.
///
/// `Struct<data, uri>` with the `lance.blob.v2` marker. Same layout Lance
/// expects on write.
///
/// ```
/// use arrow_schema::{DataType, Field, Schema};
///
/// let schema = Schema::new(vec![
/// Field::new("id", DataType::Int64, false),
/// lancedb::blob("image", true),
/// ]);
/// ```
///
/// Blob tables use Lance file format >= 2.2 and stable row ids at create.
pub fn blob(name: impl AsRef<str>, nullable: bool) -> Field {
lance::blob::blob_field(name.as_ref(), nullable)
}
/// Returns true if `schema` declares any blob v2 column.
pub(crate) fn has_blob_columns(schema: &Schema) -> bool {
schema.fields().iter().any(|field| field.is_blob_v2())
}
/// Bumps storage format to at least [`LanceFileVersion::V2_2`] for blob schemas.
pub(crate) fn ensure_blob_storage_version(schema: &Schema, params: &mut WriteParams) {
if !has_blob_columns(schema) {
return;
}
let resolved = params
.data_storage_version
.unwrap_or(LanceFileVersion::Stable)
.resolve();
if resolved < LanceFileVersion::V2_2 {
params.data_storage_version = Some(LanceFileVersion::V2_2);
}
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_schema::DataType;
use lance_arrow::ARROW_EXT_NAME_KEY;
fn blob_schema() -> Schema {
Schema::new(vec![
Field::new("id", DataType::Int64, false),
blob("image", true),
])
}
#[test]
fn blob_field_carries_v2_extension_marker() {
let field = blob("image", true);
assert_eq!(
field.metadata().get(ARROW_EXT_NAME_KEY).map(String::as_str),
Some("lance.blob.v2")
);
assert!(matches!(field.data_type(), DataType::Struct(_)));
}
#[test]
fn has_blob_columns_detects_blob_fields() {
assert!(has_blob_columns(&blob_schema()));
let plain = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
assert!(!has_blob_columns(&plain));
}
#[test]
fn storage_version_bumps_to_v2_2() {
let mut params = WriteParams::default();
ensure_blob_storage_version(&blob_schema(), &mut params);
assert_eq!(
params.data_storage_version.unwrap().resolve(),
LanceFileVersion::V2_2
);
}
#[test]
fn storage_version_overrides_lower_explicit_version() {
let mut params = WriteParams {
data_storage_version: Some(LanceFileVersion::V2_0),
..Default::default()
};
ensure_blob_storage_version(&blob_schema(), &mut params);
assert_eq!(
params.data_storage_version.unwrap().resolve(),
LanceFileVersion::V2_2
);
}
#[test]
fn storage_version_keeps_higher_explicit_version() {
let mut params = WriteParams {
data_storage_version: Some(LanceFileVersion::V2_3),
..Default::default()
};
ensure_blob_storage_version(&blob_schema(), &mut params);
assert_eq!(params.data_storage_version.unwrap(), LanceFileVersion::V2_3);
}
#[test]
fn storage_version_noop_without_blob_columns() {
let schema = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
let mut params = WriteParams::default();
ensure_blob_storage_version(&schema, &mut params);
assert!(params.data_storage_version.is_none());
}
}

View File

@@ -32,7 +32,6 @@ use crate::table::{BaseTable, WriteOptions};
pub mod listing;
pub mod namespace;
pub(crate) mod read_freshness;
pub trait DatabaseOptions {
fn serialize_into_map(&self, map: &mut HashMap<String, String>);

View File

@@ -18,7 +18,6 @@ use lance_table::io::commit::commit_handler_from_url;
use object_store::local::LocalFileSystem;
use snafu::ResultExt;
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
use crate::connection::ConnectRequest;
use crate::database::ReadConsistency;
use crate::database::namespace::LanceNamespaceDatabase;
@@ -839,16 +838,13 @@ impl ListingDatabase {
write_params.enable_v2_manifest_paths = enable_v2_manifest_paths;
}
let data_schema = request.data.arrow_schema();
if let Some(enable_stable_row_ids) = stable_row_ids_override
.or(self.new_table_config.enable_stable_row_ids)
.or(has_blob_columns(&data_schema).then_some(true))
// Apply enable_stable_row_ids: table-level override takes precedence over connection config
if let Some(enable_stable_row_ids) =
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids)
{
write_params.enable_stable_row_ids = enable_stable_row_ids;
}
ensure_blob_storage_version(&data_schema, &mut write_params);
if matches!(&request.mode, CreateTableMode::Overwrite) {
write_params.mode = WriteMode::Overwrite;
}

View File

@@ -4,7 +4,7 @@
//! Namespace-based database implementation that delegates table management to lance-namespace
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::sync::Arc;
use async_trait::async_trait;
use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore;
@@ -23,16 +23,12 @@ use lance_namespace_impls::ConnectBuilder;
use lance_table::io::commit::CommitHandler;
use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler;
use crate::blob::{ensure_blob_storage_version, has_blob_columns};
use crate::connection::NamespaceClientPushdownOperation;
use crate::database::ReadConsistency;
use crate::database::listing::{
NewTableConfig, OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, OPT_NEW_TABLE_STORAGE_VERSION,
OPT_NEW_TABLE_V2_MANIFEST_PATHS,
};
use crate::database::read_freshness::{
FreshnessBaselines, ReadFreshnessContextProvider, TableFreshness,
};
use crate::error::{Error, Result};
use crate::table::{NativeTable, map_namespace_lance_error};
use lance::dataset::WriteMode;
@@ -55,10 +51,6 @@ fn is_table_already_exists_namespace_error(err: &lance::Error) -> bool {
false
}
/// Object-id delimiter default (matches `RestNamespaceBuilder`'s); overridable
/// via the `delimiter` property.
const DEFAULT_NAMESPACE_DELIMITER: &str = "$";
/// A database implementation that uses lance-namespace for table management
pub struct LanceNamespaceDatabase {
namespace: Arc<dyn LanceNamespace>,
@@ -78,17 +70,6 @@ pub struct LanceNamespaceDatabase {
ns_properties: HashMap<String, String>,
// Options for tables created by this connection
new_table_config: NewTableConfig,
// Per-table read-freshness baselines, shared with the context provider.
freshness_baselines: FreshnessBaselines,
// Delimiter for building freshness keys; see `table_freshness`.
delimiter: String,
}
fn resolve_delimiter(ns_properties: &HashMap<String, String>) -> String {
ns_properties
.get("delimiter")
.cloned()
.unwrap_or_else(|| DEFAULT_NAMESPACE_DELIMITER.to_string())
}
impl LanceNamespaceDatabase {
@@ -101,9 +82,6 @@ impl LanceNamespaceDatabase {
session: Option<Arc<lance::session::Session>>,
namespace_client_pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
) -> Self {
// Client is pre-built, so we can't install the freshness provider here;
// baselines are still tracked for a uniform bump path.
let delimiter = resolve_delimiter(&namespace_client_properties);
Self {
namespace: namespace_client,
storage_options,
@@ -114,8 +92,6 @@ impl LanceNamespaceDatabase {
ns_impl: namespace_client_impl,
ns_properties: namespace_client_properties,
new_table_config: NewTableConfig::default(),
freshness_baselines: Arc::new(Mutex::new(HashMap::new())),
delimiter,
}
}
@@ -160,19 +136,10 @@ impl LanceNamespaceDatabase {
if let Some(ref sess) = session {
builder = builder.session(sess.clone());
}
// Install the read-freshness provider before building the client.
let freshness_baselines: FreshnessBaselines = Arc::new(Mutex::new(HashMap::new()));
builder = builder.context_provider(Arc::new(ReadFreshnessContextProvider::new(
freshness_baselines.clone(),
read_consistency_interval,
)));
let namespace = builder.connect().await.map_err(|e| Error::InvalidInput {
message: format!("Failed to connect to namespace: {:?}", e),
})?;
let delimiter = resolve_delimiter(&ns_properties);
Ok(Self {
namespace,
storage_options,
@@ -183,20 +150,9 @@ impl LanceNamespaceDatabase {
ns_impl: ns_impl.to_string(),
ns_properties,
new_table_config,
freshness_baselines,
delimiter,
})
}
/// Build a table's freshness handle, keyed to match the `object_id` the
/// namespace client sends on reads (table-id parts joined by the delimiter).
fn table_freshness(&self, namespace_path: &[String], name: &str) -> TableFreshness {
let mut parts = namespace_path.to_vec();
parts.push(name.to_string());
let key = parts.join(&self.delimiter);
TableFreshness::new(self.freshness_baselines.clone(), key)
}
fn extract_storage_overrides(
&self,
request: &DbCreateTableRequest,
@@ -258,16 +214,12 @@ impl LanceNamespaceDatabase {
params.enable_v2_manifest_paths = enable_v2_manifest_paths;
}
let data_schema = request.data.schema();
if let Some(enable_stable_row_ids) = stable_row_ids_override
.or(self.new_table_config.enable_stable_row_ids)
.or(has_blob_columns(data_schema.as_ref()).then_some(true))
if let Some(enable_stable_row_ids) =
stable_row_ids_override.or(self.new_table_config.enable_stable_row_ids)
{
params.enable_stable_row_ids = enable_stable_row_ids;
}
ensure_blob_storage_version(data_schema.as_ref(), params);
Ok(())
}
}
@@ -379,8 +331,7 @@ impl Database for LanceNamespaceDatabase {
self.pushdown_operations.clone(),
self.session.clone(),
)
.await?
.with_freshness(self.table_freshness(&request.namespace_path, &request.name));
.await?;
return Ok(Arc::new(native_table));
}
@@ -511,8 +462,7 @@ impl Database for LanceNamespaceDatabase {
self.pushdown_operations.clone(),
self.session.clone(),
)
.await?
.with_freshness(self.table_freshness(&request.namespace_path, &request.name));
.await?;
Ok(Arc::new(native_table))
}
@@ -528,8 +478,7 @@ impl Database for LanceNamespaceDatabase {
self.pushdown_operations.clone(),
self.session.clone(),
)
.await?
.with_freshness(self.table_freshness(&request.namespace_path, &request.name));
.await?;
Ok(Arc::new(native_table))
}

View File

@@ -1,312 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Read-freshness signaling for the lance-namespace path.
//!
//! Against a server that serves cached table metadata up to some staleness
//! window, a handle that just wrote (or asked for the latest version via
//! `checkout_latest`) can still read a stale snapshot. To prevent that, reads
//! routed through the namespace client carry an `x-lancedb-min-timestamp`
//! header naming the oldest snapshot the caller will accept.
//!
//! This mirrors `remote::table`: a per-table baseline is bumped to "now" on
//! every write and on `checkout_latest()`, and reads send
//! `max(baseline, now - read_consistency_interval)`. Since the namespace client
//! takes no headers directly, a [`DynamicContextProvider`] injects it per request.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime};
use lance_namespace_impls::{DynamicContextProvider, OperationInfo};
/// Provider context keys prefixed with `headers.` become HTTP headers (prefix
/// stripped), so this emits the `x-lancedb-min-timestamp` header.
const MIN_TIMESTAMP_CONTEXT_KEY: &str = "headers.x-lancedb-min-timestamp";
/// Per-table freshness baselines (keyed by namespace object id), shared between
/// the provider that reads them and the table handles that bump them.
pub type FreshnessBaselines = Arc<Mutex<HashMap<String, SystemTime>>>;
/// `max(baseline, now - interval)`, or `None` when neither constraint applies.
fn compute_min_timestamp(
baseline: Option<SystemTime>,
interval: Option<Duration>,
now: SystemTime,
) -> Option<SystemTime> {
let interval_based = match interval {
None => None,
Some(d) if d.is_zero() => Some(now),
Some(d) => Some(now.checked_sub(d).unwrap_or(now)),
};
match (interval_based, baseline) {
(None, None) => None,
(Some(t), None) | (None, Some(t)) => Some(t),
(Some(a), Some(b)) => Some(a.max(b)),
}
}
/// Advance the baseline to `now`, never backwards, so a concurrent handle's
/// write can't lower a floor another handle already set.
fn next_freshness_baseline(prev: Option<SystemTime>, now: SystemTime) -> SystemTime {
match prev {
Some(p) => p.max(now),
None => now,
}
}
/// A handle's view of the shared baseline map for a single table.
#[derive(Clone, Debug)]
pub struct TableFreshness {
baselines: FreshnessBaselines,
/// Namespace object id for this table (matches the read's `object_id`).
key: String,
}
impl TableFreshness {
pub fn new(baselines: FreshnessBaselines, key: String) -> Self {
Self { baselines, key }
}
pub fn bump(&self) {
let now = SystemTime::now();
let mut baselines = self.baselines.lock().unwrap();
let prev = baselines.get(&self.key).copied();
baselines.insert(self.key.clone(), next_freshness_baseline(prev, now));
}
}
/// Read ops that can be served stale and so carry the freshness floor.
/// `list_table_versions` resolves "latest" for managed-versioning tables, so it
/// is what makes `checkout_latest()` observe a prior write.
fn is_read_operation(operation: &str) -> bool {
matches!(
operation,
"describe_table" | "list_table_versions" | "query_table" | "list_tables"
)
}
/// Injects `x-lancedb-min-timestamp` on namespace reads, per addressed table.
#[derive(Debug)]
pub struct ReadFreshnessContextProvider {
baselines: FreshnessBaselines,
read_consistency_interval: Option<Duration>,
}
impl ReadFreshnessContextProvider {
pub fn new(baselines: FreshnessBaselines, read_consistency_interval: Option<Duration>) -> Self {
Self {
baselines,
read_consistency_interval,
}
}
}
impl DynamicContextProvider for ReadFreshnessContextProvider {
fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> {
if !is_read_operation(&info.operation) {
return HashMap::new();
}
let baseline = self.baselines.lock().unwrap().get(&info.object_id).copied();
match compute_min_timestamp(baseline, self.read_consistency_interval, SystemTime::now()) {
Some(ts) => {
let dt: chrono::DateTime<chrono::Utc> = ts.into();
HashMap::from([(MIN_TIMESTAMP_CONTEXT_KEY.to_string(), dt.to_rfc3339())])
}
None => HashMap::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Allowed slop when comparing a header timestamp against a locally
/// captured wall-clock bound. Tests run fast enough that 1s is plenty.
const TOLERANCE: Duration = Duration::from_secs(1);
fn parse_header_ts(headers: &HashMap<String, String>) -> SystemTime {
let value = headers
.get(MIN_TIMESTAMP_CONTEXT_KEY)
.expect("expected min-timestamp context key");
chrono::DateTime::parse_from_rfc3339(value)
.unwrap()
.with_timezone(&chrono::Utc)
.into()
}
#[test]
fn test_compute_min_timestamp_combines_baseline_and_interval() {
let now = SystemTime::now();
let baseline = now - Duration::from_secs(60);
// No interval, no baseline -> no header.
assert_eq!(compute_min_timestamp(None, None, now), None);
// Baseline only -> baseline.
assert_eq!(
compute_min_timestamp(Some(baseline), None, now),
Some(baseline)
);
// ZERO interval, no baseline -> now (strong consistency).
assert_eq!(
compute_min_timestamp(None, Some(Duration::ZERO), now),
Some(now)
);
// Positive interval, no baseline -> now - interval.
assert_eq!(
compute_min_timestamp(None, Some(Duration::from_secs(10)), now),
Some(now - Duration::from_secs(10))
);
// Both: pick the more-recent (tighter) constraint.
// baseline = now-60, now-interval = now-10. now-10 is newer.
assert_eq!(
compute_min_timestamp(Some(baseline), Some(Duration::from_secs(10)), now),
Some(now - Duration::from_secs(10))
);
// Both, baseline newer: pick baseline.
let recent_baseline = now - Duration::from_secs(5);
assert_eq!(
compute_min_timestamp(Some(recent_baseline), Some(Duration::from_secs(60)), now),
Some(recent_baseline)
);
}
#[test]
fn test_next_freshness_baseline_is_monotonic() {
let now = SystemTime::now();
let earlier = now - Duration::from_secs(30);
let later = now + Duration::from_secs(30);
// No prior baseline -> now.
assert_eq!(next_freshness_baseline(None, now), now);
// Prior baseline older than now -> now.
assert_eq!(next_freshness_baseline(Some(earlier), now), now);
// Prior baseline newer than now -> keep the newer baseline.
assert_eq!(next_freshness_baseline(Some(later), now), later);
}
fn provider_with(
entries: &[(&str, SystemTime)],
interval: Option<Duration>,
) -> ReadFreshnessContextProvider {
let map: HashMap<String, SystemTime> =
entries.iter().map(|(k, v)| (k.to_string(), *v)).collect();
ReadFreshnessContextProvider::new(Arc::new(Mutex::new(map)), interval)
}
#[test]
fn test_provider_emits_header_at_or_after_bumped_baseline() {
// A baseline set "now" with no interval: every read op must carry a
// floor at or after that baseline. `list_table_versions` is the hook
// that makes managed-versioning `checkout_latest()` observe a write.
let baseline = SystemTime::now();
let provider = provider_with(&[("ns$tbl", baseline)], None);
// These ops are keyed by the table id, so they pick up the per-table
// baseline. (`list_tables` is keyed by the namespace, so it is covered
// separately by the interval-floor test.)
for op in ["describe_table", "list_table_versions", "query_table"] {
let ctx = provider.provide_context(&OperationInfo::new(op, "ns$tbl"));
let sent = parse_header_ts(&ctx);
assert!(
sent >= baseline - TOLERANCE && sent <= baseline + TOLERANCE,
"operation {op} should carry a floor at the bumped baseline"
);
}
}
#[test]
fn test_provider_list_tables_uses_interval_floor_not_table_baseline() {
// `list_tables` is addressed by the namespace id, which never matches a
// per-table baseline key, so a bumped table baseline must not leak onto
// it. With no interval it sends nothing; with one it sends now-interval.
let provider = provider_with(&[("ns$tbl", SystemTime::now())], None);
let ctx = provider.provide_context(&OperationInfo::new("list_tables", "ns"));
assert!(
ctx.is_empty(),
"list_tables must not inherit a per-table baseline"
);
let interval = Duration::from_secs(30);
let provider = provider_with(&[("ns$tbl", SystemTime::now())], Some(interval));
let before = SystemTime::now();
let ctx = provider.provide_context(&OperationInfo::new("list_tables", "ns"));
let after = SystemTime::now();
let sent = parse_header_ts(&ctx);
assert!(
sent >= before - interval - TOLERANCE && sent <= after - interval + TOLERANCE,
"list_tables should carry the interval floor"
);
}
#[test]
fn test_provider_no_header_for_empty_baseline_and_no_interval() {
// Manual consistency (no interval) on a table that was never bumped:
// no floor, so the server may serve from cache.
let provider = provider_with(&[], None);
let ctx = provider.provide_context(&OperationInfo::new("describe_table", "ns$tbl"));
assert!(ctx.is_empty());
}
#[test]
fn test_provider_interval_floor_applies_without_baseline() {
// With a consistency interval and no baseline, the floor is now-interval.
let interval = Duration::from_secs(30);
let provider = provider_with(&[], Some(interval));
let before = SystemTime::now();
let ctx = provider.provide_context(&OperationInfo::new("query_table", "ns$tbl"));
let after = SystemTime::now();
let sent = parse_header_ts(&ctx);
assert!(
sent >= before - interval - TOLERANCE && sent <= after - interval + TOLERANCE,
"expected floor at roughly now - interval"
);
}
#[test]
fn test_provider_non_read_ops_emit_nothing() {
// Even with a fresh baseline and a zero interval, a non-read operation
// (which establishes rather than consumes a baseline) sends no header.
let provider = provider_with(&[("ns$tbl", SystemTime::now())], Some(Duration::ZERO));
for op in [
"create_table",
"register_table",
"drop_table",
"rename_table",
// Pinned to an immutable version, so it cannot be served stale.
"describe_table_version",
] {
let ctx = provider.provide_context(&OperationInfo::new(op, "ns$tbl"));
assert!(
ctx.is_empty(),
"operation {op} must not send a freshness header"
);
}
}
#[test]
fn test_provider_uses_per_table_baseline() {
// The floor is looked up by object id, so an unrelated table's baseline
// does not leak onto another table's read.
let baseline = SystemTime::now();
let provider = provider_with(&[("ns$has_baseline", baseline)], None);
// The bumped table gets a header.
let hit =
provider.provide_context(&OperationInfo::new("describe_table", "ns$has_baseline"));
assert!(!hit.is_empty());
// A different table with no baseline (and no interval) gets nothing.
let miss = provider.provide_context(&OperationInfo::new("describe_table", "ns$other"));
assert!(miss.is_empty());
}
}

View File

@@ -13,7 +13,7 @@ use serde_json::{Value, json};
use super::EmbeddingFunction;
use crate::{Error, Result};
use tokio::runtime::{Handle, RuntimeFlavor};
use tokio::runtime::Handle;
use tokio::task::block_in_place;
#[derive(Debug)]
@@ -148,12 +148,6 @@ impl BedrockEmbeddingFunction {
_ => unreachable!(),
};
// Bedrock's SDK is async but this trait method is synchronous, so we
// bridge with `block_in_place` + `block_on`. That requires a
// multi-threaded Tokio runtime; return a typed error instead of
// panicking when no compatible runtime is available.
let handle = current_multi_thread_handle()?;
for text in texts {
let request_body = match self.model {
BedrockEmbeddingModel::TitanEmbedding => {
@@ -169,28 +163,24 @@ impl BedrockEmbeddingFunction {
}
};
// Serialize before entering the blocking section so a serialization
// failure surfaces as a typed error rather than an `unwrap` panic.
let body = serde_json::to_vec(&request_body).map_err(|e| Error::Runtime {
message: format!("Failed to serialize Bedrock request: {e}"),
})?;
let client = self.client.clone();
let model_id = self.model.model_id().to_string();
let request_body = request_body.clone();
let response = block_in_place(|| {
handle.block_on(async move {
let response = block_in_place(move || {
Handle::current().block_on(async move {
client
.invoke_model()
.model_id(model_id)
.body(aws_sdk_bedrockruntime::primitives::Blob::new(body))
.body(aws_sdk_bedrockruntime::primitives::Blob::new(
serde_json::to_vec(&request_body).unwrap(),
))
.send()
.await
.map_err(|e| Error::Runtime {
message: format!("Bedrock invoke_model request failed: {e}"),
})
.map_err(Box::new)
})
})?;
})
.unwrap();
let response_json: Value =
serde_json::from_slice(response.body.as_ref()).map_err(|e| Error::Runtime {
@@ -198,12 +188,22 @@ impl BedrockEmbeddingFunction {
})?;
let embedding = match self.model {
BedrockEmbeddingModel::TitanEmbedding => {
json_array_to_f32(&response_json["embedding"], "embedding")?
}
BedrockEmbeddingModel::CohereLarge => {
json_array_to_f32(&response_json["embeddings"][0], "embeddings")?
}
BedrockEmbeddingModel::TitanEmbedding => response_json["embedding"]
.as_array()
.ok_or_else(|| Error::Runtime {
message: "Missing embedding in response".to_string(),
})?
.iter()
.map(|v| v.as_f64().unwrap() as f32)
.collect::<Vec<f32>>(),
BedrockEmbeddingModel::CohereLarge => response_json["embeddings"][0]
.as_array()
.ok_or_else(|| Error::Runtime {
message: "Missing embeddings in response".to_string(),
})?
.iter()
.map(|v| v.as_f64().unwrap() as f32)
.collect::<Vec<f32>>(),
};
builder.append_slice(&embedding);
@@ -212,86 +212,3 @@ impl BedrockEmbeddingFunction {
Ok(builder.finish())
}
}
/// Returns a handle to the current multi-threaded Tokio runtime, or a typed
/// [`Error::Runtime`] when called outside a runtime or on the current-thread
/// runtime. This keeps the synchronous-over-async bridge in
/// [`BedrockEmbeddingFunction::compute_inner`] from panicking on runtime
/// configurations that cannot support `block_in_place`.
fn current_multi_thread_handle() -> Result<Handle> {
let handle = Handle::try_current().map_err(|e| Error::Runtime {
message: format!("Bedrock embedding must be called from within a Tokio runtime: {e}"),
})?;
if handle.runtime_flavor() == RuntimeFlavor::CurrentThread {
return Err(Error::Runtime {
message: "Bedrock embedding requires a multi-threaded Tokio runtime; the \
current-thread runtime cannot use `block_in_place`"
.to_string(),
});
}
Ok(handle)
}
/// Converts a JSON value expected to be an array of numbers into `Vec<f32>`.
///
/// Returns a typed [`Error::Runtime`] (rather than panicking) when the value is
/// not an array or contains a non-numeric element, so malformed provider
/// responses degrade gracefully.
fn json_array_to_f32(value: &Value, field: &str) -> Result<Vec<f32>> {
let arr = value.as_array().ok_or_else(|| Error::Runtime {
message: format!("Missing or non-array '{field}' field in Bedrock response"),
})?;
arr.iter()
.map(|v| {
v.as_f64().map(|f| f as f32).ok_or_else(|| Error::Runtime {
message: format!("Non-numeric value in Bedrock '{field}' embedding: {v}"),
})
})
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_array_to_f32_parses_numbers() {
let v = json!([1.0, 2, -3.5]);
let out = json_array_to_f32(&v, "embedding").unwrap();
assert_eq!(out, vec![1.0_f32, 2.0, -3.5]);
}
#[test]
fn json_array_to_f32_rejects_non_array() {
// Missing field indexes to `Value::Null`; a malformed payload should be
// a typed error, not a panic.
let v = json!({"unexpected": "shape"});
let err = json_array_to_f32(&v["embedding"], "embedding").unwrap_err();
assert!(matches!(err, Error::Runtime { .. }), "got {err:?}");
}
#[test]
fn json_array_to_f32_rejects_non_numeric_element() {
let v = json!([1.0, "not-a-number", 3.0]);
let err = json_array_to_f32(&v, "embedding").unwrap_err();
assert!(matches!(err, Error::Runtime { .. }), "got {err:?}");
}
#[test]
fn handle_errors_without_runtime() {
// No Tokio runtime in scope -> typed error instead of a panic.
let err = current_multi_thread_handle().unwrap_err();
assert!(matches!(err, Error::Runtime { .. }), "got {err:?}");
}
#[tokio::test(flavor = "current_thread")]
async fn handle_errors_on_current_thread_runtime() {
let err = current_multi_thread_handle().unwrap_err();
assert!(matches!(err, Error::Runtime { .. }), "got {err:?}");
}
#[tokio::test(flavor = "multi_thread")]
async fn handle_ok_on_multi_thread_runtime() {
current_multi_thread_handle().expect("multi-threaded runtime should be accepted");
}
}

View File

@@ -163,7 +163,6 @@
//! ```
pub mod arrow;
pub mod blob;
pub mod connection;
pub mod data;
pub mod database;
@@ -189,7 +188,6 @@ use std::{fmt::Display, str::FromStr};
use serde::{Deserialize, Serialize};
pub use blob::blob;
pub use connection::{ConnectNamespaceBuilder, Connection};
pub use error::{Error, Result};
use lance_index::vector::ApproxMode as LanceApproxMode;

View File

@@ -1352,35 +1352,6 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
}
}
/// Deserialize an index's `created_at` field.
///
/// The server returns this as an RFC 3339 string (e.g. `"2026-06-18T21:37:36.637Z"`),
/// but older deployments sent a unix timestamp in milliseconds. Accept both so the
/// client works against any server version.
fn deserialize_created_at<'de, D>(
deserializer: D,
) -> std::result::Result<Option<DateTime<Utc>>, D::Error>
where
D: serde::Deserializer<'de>,
{
use serde::de::Error as _;
#[derive(Deserialize)]
#[serde(untagged)]
enum CreatedAt {
Rfc3339(String),
Millis(i64),
}
match Option::<CreatedAt>::deserialize(deserializer)? {
None => Ok(None),
Some(CreatedAt::Rfc3339(s)) => DateTime::parse_from_rfc3339(&s)
.map(|dt| Some(dt.with_timezone(&Utc)))
.map_err(D::Error::custom),
Some(CreatedAt::Millis(ms)) => Ok(DateTime::from_timestamp_millis(ms)),
}
}
impl<S: HttpSend + 'static> RemoteTable<S> {
/// Parse the response from `/index/list/` into `IndexConfig` entries.
///
@@ -1409,7 +1380,7 @@ impl<S: HttpSend + 'static> RemoteTable<S> {
// Used as the sentinel to decide whether to skip the stats call.
index_type: Option<IndexType>,
index_uuid: Option<String>,
#[serde(default, deserialize_with = "deserialize_created_at")]
#[serde(default, with = "chrono::serde::ts_milliseconds_option")]
created_at: Option<DateTime<Utc>>,
num_indexed_rows: Option<u64>,
num_unindexed_rows: Option<u64>,
@@ -4707,7 +4678,7 @@ mod tests {
"num_segments": 2,
"index_version": 1,
"index_details": "{\"num_partitions\":16}",
"created_at": "2026-06-18T21:37:36.637Z",
"created_at": 1700000000000i64,
"type_url": "type.googleapis.com/lance.index.vector.IvfPq",
},
{
@@ -4757,10 +4728,7 @@ mod tests {
vec_idx.type_url,
Some("type.googleapis.com/lance.index.vector.IvfPq".to_string())
);
assert_eq!(
vec_idx.created_at,
Some("2026-06-18T21:37:36.637Z".parse::<DateTime<Utc>>().unwrap())
);
assert!(vec_idx.created_at.is_some());
let text_idx = &indices[1];
assert_eq!(text_idx.name, "text_idx");
@@ -4781,36 +4749,6 @@ mod tests {
assert_eq!(text_idx.created_at, None);
}
#[test]
fn test_deserialize_created_at() {
#[derive(Deserialize)]
struct Wrapper {
#[serde(default, deserialize_with = "deserialize_created_at")]
created_at: Option<DateTime<Utc>>,
}
// RFC 3339 string (current server format).
let w: Wrapper =
serde_json::from_str(r#"{"created_at": "2026-06-18T21:37:36.637Z"}"#).unwrap();
assert_eq!(
w.created_at,
Some("2026-06-18T21:37:36.637Z".parse::<DateTime<Utc>>().unwrap())
);
// Unix milliseconds (legacy server format).
let w: Wrapper = serde_json::from_str(r#"{"created_at": 1700000000000}"#).unwrap();
assert_eq!(w.created_at, DateTime::from_timestamp_millis(1700000000000));
// Null and missing both yield None.
let w: Wrapper = serde_json::from_str(r#"{"created_at": null}"#).unwrap();
assert_eq!(w.created_at, None);
let w: Wrapper = serde_json::from_str(r#"{}"#).unwrap();
assert_eq!(w.created_at, None);
// A malformed string is rejected rather than silently dropped to None.
assert!(serde_json::from_str::<Wrapper>(r#"{"created_at": "not-a-date"}"#).is_err());
}
#[tokio::test]
async fn test_list_versions() {
let table = Table::new_with_handler("my_table", |request| {

View File

@@ -43,7 +43,6 @@ use crate::connection::NamespaceClientPushdownOperation;
use crate::data::scannable::{PeekedScannable, Scannable, estimate_write_partitions};
use crate::database::Database;
use crate::database::read_freshness::TableFreshness;
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MemoryRegistry};
use crate::error::{Error, Result};
use crate::index::IndexStatistics;
@@ -1764,8 +1763,6 @@ pub struct NativeTable {
// Operations to push down to the namespace server.
// pub(crate) so query.rs can access the field for server-side query execution.
pub(crate) pushdown_operations: HashSet<NamespaceClientPushdownOperation>,
// Read-freshness baseline; `Some` only for namespace-backed tables.
freshness: Option<TableFreshness>,
}
impl std::fmt::Debug for NativeTable {
@@ -1926,7 +1923,6 @@ impl NativeTable {
read_consistency_interval,
namespace_client,
pushdown_operations,
freshness: None,
})
}
@@ -1938,12 +1934,6 @@ impl NativeTable {
self
}
/// Attach the read-freshness baseline handle (namespace connections only).
pub(crate) fn with_freshness(mut self, freshness: TableFreshness) -> Self {
self.freshness = Some(freshness);
self
}
/// Build a sibling `NativeTable` with the same identity but a different
/// (independent) dataset wrapper — used to hand out branch-scoped handles.
fn with_dataset(&self, dataset: dataset::DatasetConsistencyWrapper) -> Self {
@@ -1956,14 +1946,6 @@ impl NativeTable {
read_consistency_interval: self.read_consistency_interval,
namespace_client: self.namespace_client.clone(),
pushdown_operations: self.pushdown_operations.clone(),
freshness: self.freshness.clone(),
}
}
/// Bump the read-freshness baseline; no-op for non-namespace tables.
fn bump_freshness(&self) {
if let Some(freshness) = &self.freshness {
freshness.bump();
}
}
@@ -2063,7 +2045,6 @@ impl NativeTable {
read_consistency_interval,
namespace_client: stored_namespace_client,
pushdown_operations,
freshness: None,
})
}
@@ -2153,7 +2134,6 @@ impl NativeTable {
read_consistency_interval,
namespace_client,
pushdown_operations,
freshness: None,
})
}
@@ -2285,7 +2265,6 @@ impl NativeTable {
read_consistency_interval,
namespace_client: stored_namespace_client,
pushdown_operations,
freshness: None,
})
}
@@ -2445,8 +2424,6 @@ impl BaseTable for NativeTable {
}
async fn checkout_latest(&self) -> Result<()> {
// Bump before resolving "latest" so that request carries the floor.
self.bump_freshness();
self.dataset.as_latest().await?;
self.dataset.reload().await
}
@@ -2534,8 +2511,6 @@ impl BaseTable for NativeTable {
debug_assert_eq!(dataset.version().version, version);
dataset.restore().await?;
}
// Restore moves "latest", so bump before resolving it (as RemoteTable does).
self.bump_freshness();
self.dataset.as_latest().await?;
Ok(())
}
@@ -2616,13 +2591,7 @@ impl BaseTable for NativeTable {
output.plan
};
let insert_exec = Arc::new(InsertExec::new_with_tracker(
ds_wrapper.clone(),
ds,
plan,
lance_params,
output.tracker.clone(),
));
let insert_exec = Arc::new(InsertExec::new(ds_wrapper.clone(), ds, plan, lance_params));
let tracker_for_tasks = output.tracker.clone();
if let Some(ref t) = tracker_for_tasks {
@@ -2655,7 +2624,6 @@ impl BaseTable for NativeTable {
}
let version = ds_wrapper.get().await?.manifest().version;
self.bump_freshness();
Ok(AddResult { version })
}
@@ -2706,9 +2674,7 @@ impl BaseTable for NativeTable {
async fn update(&self, update: UpdateBuilder) -> Result<UpdateResult> {
// Delegate to the submodule implementation
let result = update::execute_update(self, update).await?;
self.bump_freshness();
Ok(result)
update::execute_update(self, update).await
}
async fn create_plan(
@@ -2740,9 +2706,7 @@ impl BaseTable for NativeTable {
params: MergeInsertBuilder,
new_data: Box<dyn RecordBatchReader + Send>,
) -> Result<MergeResult> {
let result = merge::execute_merge_insert(self, params, new_data).await?;
self.bump_freshness();
Ok(result)
merge::execute_merge_insert(self, params, new_data).await
}
async fn set_unenforced_primary_key(&self, columns: &[&str]) -> Result<()> {
@@ -2763,9 +2727,7 @@ impl BaseTable for NativeTable {
/// Delete rows from the table
async fn delete(&self, predicate: Predicate<'_>) -> Result<DeleteResult> {
let result = delete::execute_delete(self, predicate).await?;
self.bump_freshness();
Ok(result)
delete::execute_delete(self, predicate).await
}
async fn tags(&self) -> Result<Box<dyn Tags + '_>> {
@@ -2784,30 +2746,22 @@ impl BaseTable for NativeTable {
transforms: NewColumnTransform,
read_columns: Option<Vec<String>>,
) -> Result<AddColumnsResult> {
let result = schema_evolution::execute_add_columns(self, transforms, read_columns).await?;
self.bump_freshness();
Ok(result)
schema_evolution::execute_add_columns(self, transforms, read_columns).await
}
async fn alter_columns(&self, alterations: &[ColumnAlteration]) -> Result<AlterColumnsResult> {
let result = schema_evolution::execute_alter_columns(self, alterations).await?;
self.bump_freshness();
Ok(result)
schema_evolution::execute_alter_columns(self, alterations).await
}
async fn update_field_metadata(
&self,
updates: &[FieldMetadataUpdate],
) -> Result<UpdateFieldMetadataResult> {
let result = schema_evolution::execute_update_field_metadata(self, updates).await?;
self.bump_freshness();
Ok(result)
schema_evolution::execute_update_field_metadata(self, updates).await
}
async fn drop_columns(&self, columns: &[&str]) -> Result<DropColumnsResult> {
let result = schema_evolution::execute_drop_columns(self, columns).await?;
self.bump_freshness();
Ok(result)
schema_evolution::execute_drop_columns(self, columns).await
}
async fn list_indices(&self) -> Result<Vec<IndexConfig>> {

View File

@@ -26,9 +26,6 @@ pub enum AddDataMode {
#[default]
Append,
/// The existing table will be overwritten with the new data
///
/// On overwrite, raw binary is not coerced into a blob struct. The input
/// must declare blob v2 for the column to stay a blob column.
Overwrite,
}

View File

@@ -3,7 +3,6 @@
//! This module contains adapters to allow LanceDB tables to be used as DataFusion table providers.
mod blob_coerce;
pub mod cast;
pub mod insert;
pub mod reject_nan;

View File

@@ -1,495 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Coerces write-path input into blob v2 struct columns.
//!
//! [`super::cast::cast_to_table_schema`] calls [`coerce_blob_expr`].
use std::sync::Arc;
use arrow_schema::{DataType, Field, FieldRef};
use datafusion::functions::core::{get_field, named_struct};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_physical_expr::ScalarFunctionExpr;
use datafusion_physical_expr::expressions::{CastExpr, Literal};
use datafusion_physical_plan::PhysicalExpr;
use crate::error::{Error, Result};
/// Build a projection expression coercing `input_expr` into the blob struct
/// declared by `table_field`, composing `named_struct` / `get_field` / `cast`.
pub(super) fn coerce_blob_expr(
input_expr: Arc<dyn PhysicalExpr>,
input_field: &Field,
table_field: &FieldRef,
config: &Arc<ConfigOptions>,
) -> Result<(Arc<dyn PhysicalExpr>, FieldRef)> {
let DataType::Struct(declared_fields) = table_field.data_type() else {
return Err(Error::InvalidInput {
message: format!(
"blob v2 column '{}' must be a struct, table declares {}",
table_field.name(),
table_field.data_type()
),
});
};
let input_struct_children = match input_field.data_type() {
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => None,
DataType::Struct(children) => {
if !children
.iter()
.any(|c| c.name() == "data" || c.name() == "uri")
{
return Err(Error::InvalidInput {
message: format!(
"blob struct input for column '{}' must contain a 'data' or 'uri' child",
table_field.name()
),
});
}
Some(children)
}
other => {
return Err(Error::InvalidInput {
message: format!(
"cannot coerce column '{}' with type {} into a blob v2 struct. \
expected Binary, LargeBinary, BinaryView, or a Struct with a 'data' or 'uri' child",
table_field.name(),
other,
),
});
}
};
let mut ns_args: Vec<Arc<dyn PhysicalExpr>> = Vec::with_capacity(declared_fields.len() * 2);
for declared in declared_fields.iter() {
ns_args.push(Arc::new(Literal::new(ScalarValue::from(
declared.name().as_str(),
))));
let value: Arc<dyn PhysicalExpr> = match input_struct_children {
// Raw binary lands in `data` and everything else is a typed null.
None => {
if declared.name() == "data" {
Arc::new(CastExpr::new(
input_expr.clone(),
declared.data_type().clone(),
None,
))
} else {
typed_null(declared.data_type())?
}
}
Some(children) => match children.iter().find(|c| c.name() == declared.name()) {
Some(child) => {
let field_expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
&format!("get_field({})", declared.name()),
get_field(),
vec![
input_expr.clone(),
Arc::new(Literal::new(ScalarValue::from(declared.name().as_str()))),
],
Arc::new(child.as_ref().clone()),
config.clone(),
));
if child.data_type() == declared.data_type() {
field_expr
} else {
Arc::new(CastExpr::new(
field_expr,
declared.data_type().clone(),
None,
))
}
}
None => typed_null(declared.data_type())?,
},
};
ns_args.push(value);
}
let expr: Arc<dyn PhysicalExpr> = Arc::new(ScalarFunctionExpr::new(
&format!("named_struct({})", table_field.name()),
named_struct(),
ns_args,
table_field.clone(),
config.clone(),
));
Ok((expr, table_field.clone()))
}
fn typed_null(data_type: &DataType) -> Result<Arc<dyn PhysicalExpr>> {
let scalar = ScalarValue::try_from(data_type).map_err(|e| Error::InvalidInput {
message: format!("cannot build null literal for blob child type {data_type}: {e}"),
})?;
Ok(Arc::new(Literal::new(scalar)))
}
#[cfg(test)]
mod tests {
use super::super::cast::cast_to_table_schema;
use super::*;
use crate::blob::blob;
use arrow_array::{
Array, ArrayRef, BinaryArray, BinaryViewArray, Int32Array, Int64Array, LargeBinaryArray,
RecordBatch, StringArray, StructArray, UInt8Array, UInt64Array,
};
use arrow_schema::Schema;
use datafusion::prelude::SessionContext;
use datafusion_catalog::MemTable;
use datafusion_physical_plan::ExecutionPlan;
use futures::TryStreamExt;
use lance_arrow::FieldExt;
use std::collections::HashMap;
fn wide_blob_field(name: &str) -> Field {
Field::new(
name,
DataType::Struct(
vec![
Field::new("data", DataType::LargeBinary, true),
Field::new("uri", DataType::Utf8, true),
Field::new("position", DataType::UInt64, true),
Field::new("size", DataType::UInt64, true),
]
.into(),
),
true,
)
.with_metadata(HashMap::from([(
"ARROW:extension:name".to_string(),
"lance.blob.v2".to_string(),
)]))
}
fn blob_table_schema() -> Schema {
Schema::new(vec![
Field::new("id", DataType::Int64, false),
blob("image", true),
])
}
fn batch_with_image(image_field: Field, image: ArrayRef) -> RecordBatch {
let len = image.len();
RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
image_field,
])),
vec![Arc::new(Int64Array::from_iter_values(0..len as i64)), image],
)
.unwrap()
}
fn image_struct(batch: &RecordBatch) -> &StructArray {
batch
.column_by_name("image")
.unwrap()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
}
async fn plan_from_batch(batch: RecordBatch) -> Arc<dyn ExecutionPlan> {
let schema = batch.schema();
let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
let ctx = SessionContext::new();
ctx.register_table("t", Arc::new(table)).unwrap();
let df = ctx.table("t").await.unwrap();
df.create_physical_plan().await.unwrap()
}
async fn coerce(batch: RecordBatch, table_schema: &Schema) -> RecordBatch {
let plan = plan_from_batch(batch).await;
let plan = cast_to_table_schema(plan, table_schema).unwrap();
let ctx = SessionContext::new();
let stream = plan.execute(0, ctx.task_ctx()).unwrap();
let batches: Vec<RecordBatch> = stream.try_collect().await.unwrap();
arrow_select::concat::concat_batches(&plan.schema(), &batches).unwrap()
}
async fn coerce_err(batch: RecordBatch, table_schema: &Schema) -> Error {
let plan = plan_from_batch(batch).await;
cast_to_table_schema(plan, table_schema).unwrap_err()
}
#[tokio::test]
async fn large_binary_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::LargeBinary, true),
Arc::new(LargeBinaryArray::from_iter_values([b"hello".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
let image_field = coerced.schema().field_with_name("image").unwrap().clone();
assert!(image_field.is_blob_v2());
assert!(matches!(image_field.data_type(), DataType::Struct(_)));
let data = image_struct(&coerced).column_by_name("data").unwrap();
let data: &LargeBinaryArray = data.as_any().downcast_ref().unwrap();
assert_eq!(data.value(0), b"hello");
}
#[tokio::test]
async fn binary_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::Binary, true),
Arc::new(BinaryArray::from_iter_values([b"hi".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
assert!(
coerced
.schema()
.field_with_name("image")
.unwrap()
.is_blob_v2()
);
}
#[tokio::test]
async fn binary_view_coerces_to_declared_blob_struct() {
let batch = batch_with_image(
Field::new("image", DataType::BinaryView, true),
Arc::new(BinaryViewArray::from_iter_values([b"view".as_slice()])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
let data = image_struct(&coerced).column_by_name("data").unwrap();
let data: &LargeBinaryArray = data.as_any().downcast_ref().unwrap();
assert_eq!(data.value(0), b"view");
}
#[tokio::test]
async fn binary_nulls_stay_null_after_coercion() {
let batch = batch_with_image(
Field::new("image", DataType::Binary, true),
Arc::new(BinaryArray::from_iter(vec![
Some(b"present".as_slice()),
None,
])),
);
let coerced = coerce(batch, &blob_table_schema()).await;
let image = image_struct(&coerced);
let data = image.column_by_name("data").unwrap();
assert!(!data.is_null(0));
assert!(data.is_null(1));
}
#[tokio::test]
async fn binary_coerces_into_four_child_blob_layout() {
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
let batch = batch_with_image(
Field::new("image", DataType::LargeBinary, true),
Arc::new(LargeBinaryArray::from_iter(vec![
Some(b"alpha".as_slice()),
None,
])),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
assert_eq!(
image.num_columns(),
4,
"coerced struct keeps the declared layout"
);
assert!(image.column_by_name("position").unwrap().is_null(0));
assert!(image.column_by_name("size").unwrap().is_null(0));
assert!(!image.column_by_name("data").unwrap().is_null(0));
assert!(image.column_by_name("data").unwrap().is_null(1));
}
#[tokio::test]
async fn prebuilt_struct_gains_blob_field_metadata() {
let DataType::Struct(children) = blob("image", true).data_type().clone() else {
unreachable!("blob field is a struct")
};
let prebuilt = StructArray::new(
children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"prebuilt".as_slice()])),
Arc::new(StringArray::from(vec![None::<&str>])),
],
None,
);
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &blob_table_schema()).await;
assert!(
coerced
.schema()
.field_with_name("image")
.unwrap()
.is_blob_v2()
);
}
#[tokio::test]
async fn prebuilt_narrow_struct_widens_to_declared_layout() {
let DataType::Struct(narrow_children) = blob("image", true).data_type().clone() else {
unreachable!("blob field is a struct")
};
let prebuilt = StructArray::new(
narrow_children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"prebuilt".as_slice()])),
Arc::new(StringArray::from(vec![None::<&str>])),
],
None,
);
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
assert_eq!(image.num_columns(), 4);
assert!(image.column_by_name("position").unwrap().is_null(0));
assert!(image.column_by_name("size").unwrap().is_null(0));
}
#[tokio::test]
async fn external_reference_struct_preserves_uri_position_and_size() {
let prebuilt = StructArray::new(
vec![
Field::new("data", DataType::LargeBinary, true),
Field::new("uri", DataType::Utf8, true),
Field::new("position", DataType::UInt64, true),
Field::new("size", DataType::UInt64, true),
]
.into(),
vec![
Arc::new(LargeBinaryArray::from(vec![None::<&[u8]>])) as ArrayRef,
Arc::new(StringArray::from(vec![Some("s3://bucket/blob.bin")])) as ArrayRef,
Arc::new(UInt64Array::from(vec![Some(7)])) as ArrayRef,
Arc::new(UInt64Array::from(vec![Some(6)])) as ArrayRef,
],
None,
);
let table_schema = Schema::new(vec![
Field::new("id", DataType::Int64, false),
wide_blob_field("image"),
]);
let batch = batch_with_image(
Field::new("image", prebuilt.data_type().clone(), true),
Arc::new(prebuilt),
);
let coerced = coerce(batch, &table_schema).await;
let image = image_struct(&coerced);
let uri: &StringArray = image
.column_by_name("uri")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(uri.value(0), "s3://bucket/blob.bin");
let position: &UInt64Array = image
.column_by_name("position")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(position.value(0), 7);
let size: &UInt64Array = image
.column_by_name("size")
.unwrap()
.as_any()
.downcast_ref()
.unwrap();
assert_eq!(size.value(0), 6);
assert!(image.column_by_name("data").unwrap().is_null(0));
}
#[tokio::test]
async fn descriptor_struct_without_value_child_is_rejected() {
let descriptor = StructArray::new(
vec![
Field::new("kind", DataType::UInt8, false),
Field::new("position", DataType::UInt64, false),
Field::new("size", DataType::UInt64, false),
]
.into(),
vec![
Arc::new(UInt8Array::from(vec![0])),
Arc::new(UInt64Array::from(vec![0])),
Arc::new(UInt64Array::from(vec![0])),
],
None,
);
let batch = batch_with_image(
Field::new("image", descriptor.data_type().clone(), true),
Arc::new(descriptor),
);
let err = coerce_err(batch, &blob_table_schema()).await;
assert!(err.to_string().contains("'data' or 'uri'"));
assert!(err.to_string().contains("image"));
}
#[tokio::test]
async fn unsupported_input_type_is_rejected_with_column_name() {
let batch = batch_with_image(
Field::new("image", DataType::Utf8, true),
Arc::new(StringArray::from(vec!["not bytes"])),
);
let err = coerce_err(batch, &blob_table_schema()).await;
assert!(matches!(err, Error::InvalidInput { .. }), "got {err:?}");
assert!(err.to_string().contains("image"));
}
#[tokio::test]
async fn blob_metadata_survives_cast_of_sibling_column() {
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("image", DataType::LargeBinary, true),
])),
vec![
Arc::new(Int32Array::from(vec![1])),
Arc::new(LargeBinaryArray::from_iter_values([b"x".as_slice()])),
],
)
.unwrap();
let coerced = coerce(batch, &blob_table_schema()).await;
let image_field = coerced.schema().field_with_name("image").unwrap().clone();
assert!(
image_field.is_blob_v2(),
"expected blob marker on image field, got {:?}",
image_field.metadata()
);
assert_eq!(
coerced.schema().field_with_name("id").unwrap().data_type(),
&DataType::Int64
);
}
#[tokio::test]
async fn exact_blob_input_passes_through_unchanged() {
let DataType::Struct(children) = blob("image", true).data_type().clone() else {
unreachable!("blob field is a struct")
};
let image = StructArray::new(
children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"exact".as_slice()])),
Arc::new(StringArray::from(vec![None::<&str>])),
],
None,
);
let batch = batch_with_image(blob("image", true), Arc::new(image));
let table_schema = blob_table_schema();
let input = plan_from_batch(batch).await;
let input_ptr = Arc::as_ptr(&input);
let plan = cast_to_table_schema(input, &table_schema).unwrap();
assert_eq!(Arc::as_ptr(&plan), input_ptr, "no projection inserted");
}
}

View File

@@ -13,10 +13,8 @@ use datafusion_physical_expr::expressions::{CastExpr, Literal};
use datafusion_physical_plan::expressions::Column;
use datafusion_physical_plan::projection::ProjectionExec;
use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr};
use lance_arrow::FieldExt;
use lance_arrow::json::{is_arrow_json_field, is_json_field};
use super::blob_coerce::coerce_blob_expr;
use crate::{Error, Result};
pub fn cast_to_table_schema(
@@ -79,17 +77,6 @@ fn build_field_exprs(
continue;
}
// Blob columns accept raw binary on write; exact matches pass through below.
if table_field.is_blob_v2() && input_field.as_ref() != table_field.as_ref() {
result.push(coerce_blob_expr(
input_expr,
input_field,
table_field,
&config,
)?);
continue;
}
let expr = match (input_field.data_type(), table_field.data_type()) {
// Both are structs: recurse into sub-fields to handle subschemas and casts.
(DataType::Struct(in_children), DataType::Struct(tbl_children))

View File

@@ -4,7 +4,6 @@
//! DataFusion ExecutionPlan for inserting data into LanceDB tables.
use std::any::Any;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, LazyLock, Mutex};
use arrow_array::{RecordBatch, UInt64Array};
@@ -21,12 +20,11 @@ use datafusion_physical_plan::{
use futures::TryStreamExt;
use lance::Dataset;
use lance::dataset::transaction::{Operation, Transaction};
use lance::dataset::{CommitBuilder, InsertBuilder, WriteParams, WriteProgressFn};
use lance::dataset::{CommitBuilder, InsertBuilder, WriteParams};
use lance::io::exec::utils::InstrumentedRecordBatchStreamAdapter;
use lance_table::format::Fragment;
use crate::table::dataset::DatasetConsistencyWrapper;
use crate::table::write_progress::WriteProgressTracker;
pub(crate) static COUNT_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| {
Arc::new(ArrowSchema::new(vec![Field::new(
@@ -83,7 +81,6 @@ pub struct InsertExec {
dataset: Arc<Dataset>,
input: Arc<dyn ExecutionPlan>,
write_params: WriteParams,
tracker: Option<Arc<WriteProgressTracker>>,
properties: Arc<PlanProperties>,
partial_transactions: Arc<Mutex<Vec<Transaction>>>,
metrics: ExecutionPlanMetricsSet,
@@ -95,16 +92,6 @@ impl InsertExec {
dataset: Arc<Dataset>,
input: Arc<dyn ExecutionPlan>,
write_params: WriteParams,
) -> Self {
Self::new_with_tracker(ds_wrapper, dataset, input, write_params, None)
}
pub(crate) fn new_with_tracker(
ds_wrapper: DatasetConsistencyWrapper,
dataset: Arc<Dataset>,
input: Arc<dyn ExecutionPlan>,
write_params: WriteParams,
tracker: Option<Arc<WriteProgressTracker>>,
) -> Self {
let schema = COUNT_SCHEMA.clone();
let num_partitions = input.output_partitioning().partition_count();
@@ -120,7 +107,6 @@ impl InsertExec {
dataset,
input,
write_params,
tracker,
properties: Arc::new(properties),
partial_transactions: Arc::new(Mutex::new(Vec::with_capacity(num_partitions))),
metrics: ExecutionPlanMetricsSet::new(),
@@ -175,12 +161,11 @@ impl ExecutionPlan for InsertExec {
"InsertExec requires exactly one child".to_string(),
));
}
Ok(Arc::new(Self::new_with_tracker(
Ok(Arc::new(Self::new(
self.ds_wrapper.clone(),
self.dataset.clone(),
children[0].clone(),
self.write_params.clone(),
self.tracker.clone(),
)))
}
@@ -191,11 +176,10 @@ impl ExecutionPlan for InsertExec {
) -> DataFusionResult<SendableRecordBatchStream> {
let input_stream = self.input.execute(partition, context)?;
let dataset = self.dataset.clone();
let mut write_params = self.write_params.clone();
let write_params = self.write_params.clone();
let partial_transactions = self.partial_transactions.clone();
let total_partitions = self.input.output_partitioning().partition_count();
let ds_wrapper = self.ds_wrapper.clone();
let tracker = self.tracker.clone();
let output_bytes = MetricBuilder::new(&self.metrics).output_bytes(partition);
let input_schema = input_stream.schema();
@@ -211,20 +195,6 @@ impl ExecutionPlan for InsertExec {
));
let stream = futures::stream::once(async move {
if let Some(tracker) = tracker
&& write_params.write_progress.is_none()
{
let last_bytes = Arc::new(AtomicU64::new(0));
write_params.write_progress = Some(WriteProgressFn::new(move |stats| {
let previous = last_bytes.swap(stats.bytes_written, Ordering::Relaxed);
if stats.bytes_written > previous {
let delta =
usize::try_from(stats.bytes_written - previous).unwrap_or(usize::MAX);
tracker.record_bytes(delta);
}
}));
}
let transaction = InsertBuilder::new(dataset.clone())
.with_params(&write_params)
.execute_uncommitted_stream(input_stream)

View File

@@ -518,10 +518,6 @@ mod tests {
let wrapper = DatasetConsistencyWrapper::new_latest(ds, Some(Duration::from_millis(200)));
// Freeze `cached_at` on the mock clock so a slow external write below can't
// expire the TTL before the explicit advance_by() does (flake on loaded CI).
clock::pin();
// Populate the cache
let v1 = wrapper.get().await.unwrap().version().version;
assert_eq!(v1, 1);

View File

@@ -142,21 +142,11 @@ impl WriteProgressTracker {
cb(&progress);
}
/// Record wire bytes from the insert layer.
///
/// These bytes may be IPC-encoded bytes for remote writes or bytes handed
/// to Lance's local writer. When wire bytes are recorded, they take
/// precedence over the in-memory Arrow bytes tracked by [`record_batch`].
/// Record wire bytes from the insert layer (e.g. IPC-encoded bytes for
/// remote writes). When wire bytes are recorded, they take precedence over
/// the in-memory Arrow bytes tracked by [`record_batch`].
pub fn record_bytes(&self, bytes: usize) {
self.wire_bytes.fetch_add(bytes, Ordering::Relaxed);
let mut cb = self.callback.lock().unwrap_or_else(|e| e.into_inner());
let guard = self
.rows_and_bytes
.lock()
.unwrap_or_else(|e| e.into_inner());
let progress = self.snapshot(guard.0, guard.1, false);
drop(guard);
cb(&progress);
}
/// Emit the final progress callback indicating the write is complete.
@@ -179,6 +169,8 @@ impl WriteProgressTracker {
let wire = self.wire_bytes.load(Ordering::Relaxed);
// Prefer wire bytes (actual I/O size) when the insert layer is
// tracking them; fall back to in-memory Arrow size otherwise.
// TODO: for local writes, track actual bytes written by Lance
// instead of using in-memory Arrow size as a proxy.
let output_bytes = if wire > 0 { wire } else { in_memory_bytes };
WriteProgress {
elapsed: self.start.elapsed(),
@@ -391,54 +383,6 @@ mod tests {
}
}
#[tokio::test]
async fn test_progress_uses_lance_write_bytes_for_local_tables() {
let dir = tempfile::tempdir().unwrap();
let db = connect(dir.path().to_str().unwrap())
.execute()
.await
.unwrap();
let batch = record_batch!(("id", Int32, [1, 2, 3])).unwrap();
let table = db
.create_table("local_write_bytes", batch)
.execute()
.await
.unwrap();
let new_data = record_batch!(("id", Int32, [4, 5, 6])).unwrap();
let in_memory_bytes = new_data.get_array_memory_size();
let final_bytes = Arc::new(AtomicUsize::new(0));
let seen_non_memory_bytes = Arc::new(std::sync::atomic::AtomicBool::new(false));
let final_bytes_cb = final_bytes.clone();
let seen_non_memory_bytes_cb = seen_non_memory_bytes.clone();
table
.add(new_data)
.write_parallelism(1)
.progress(move |p| {
if p.output_bytes() > 0 && p.output_bytes() != in_memory_bytes {
seen_non_memory_bytes_cb.store(true, Ordering::SeqCst);
}
if p.done() {
final_bytes_cb.store(p.output_bytes(), Ordering::SeqCst);
}
})
.execute()
.await
.unwrap();
assert!(
seen_non_memory_bytes.load(Ordering::SeqCst),
"progress should report Lance writer bytes, not only Arrow memory bytes"
);
assert_ne!(
final_bytes.load(Ordering::SeqCst),
in_memory_bytes,
"final progress bytes should come from Lance write stats"
);
}
#[test]
fn test_record_batch_recovers_from_poisoned_callback_lock() {
use super::{ProgressCallback, WriteProgressTracker};

View File

@@ -329,15 +329,6 @@ pub mod clock {
});
}
/// Start mock time at the current instant if not already pinned.
pub fn pin() {
MOCK_NOW.with(|mock| {
if mock.get().is_none() {
mock.set(Some(Instant::now()));
}
});
}
#[allow(dead_code)]
pub fn clear_mock() {
MOCK_NOW.with(|mock| mock.set(None));

View File

@@ -1,380 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! Integration tests for blob v2 columns.
use std::sync::Arc;
use arrow_array::{Array, BinaryArray, Int64Array, LargeBinaryArray, RecordBatch, StructArray};
use arrow_schema::{DataType, Field, Schema};
use futures::TryStreamExt;
use lance_encoding::version::LanceFileVersion;
use lancedb::{
Connection, Result, Table, blob::blob, connect,
database::listing::OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, query::ExecutableQuery,
};
use tempfile::tempdir;
fn blob_table_schema() -> Arc<Schema> {
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
blob("image", true),
]))
}
fn binary_input_batch(ids: &[i64], payloads: &[Option<&[u8]>]) -> RecordBatch {
RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("image", DataType::LargeBinary, true),
])),
vec![
Arc::new(Int64Array::from(ids.to_vec())),
Arc::new(LargeBinaryArray::from_iter(payloads.iter().copied())),
],
)
.unwrap()
}
async fn create_inline_blob_table(
db: &Connection,
name: &str,
ids: &[i64],
payloads: &[Option<&[u8]>],
) -> Result<Table> {
let table = db
.create_empty_table(name, blob_table_schema())
.execute()
.await?;
table
.add(binary_input_batch(ids, payloads))
.execute()
.await?;
Ok(table)
}
async fn storage_format_version(table: &Table) -> LanceFileVersion {
table
.as_native()
.unwrap()
.manifest()
.await
.unwrap()
.data_storage_format
.lance_file_version()
.unwrap()
.resolve()
}
async fn uses_stable_row_ids(table: &Table) -> bool {
table
.as_native()
.unwrap()
.manifest()
.await
.unwrap()
.uses_stable_row_ids()
}
async fn query_image_struct(table: &Table) -> StructArray {
let batches = table
.query()
.execute()
.await
.unwrap()
.try_collect::<Vec<_>>()
.await
.unwrap();
let batch = arrow_select::concat::concat_batches(&batches[0].schema(), &batches).unwrap();
batch
.column_by_name("image")
.expect("image column present")
.as_any()
.downcast_ref::<StructArray>()
.expect("blob column reads back as a descriptor struct")
.clone()
}
#[tokio::test]
async fn declaring_blob_column_bumps_format_and_enables_stable_row_ids() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = db
.create_empty_table("t", blob_table_schema())
.execute()
.await?;
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
assert!(uses_stable_row_ids(&table).await);
Ok(())
}
#[tokio::test]
async fn explicit_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = db
.create_empty_table("t", blob_table_schema())
.storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
.execute()
.await?;
assert!(
storage_format_version(&table).await >= LanceFileVersion::V2_2,
"format bump still applies; the schema cannot be written below 2.2"
);
assert!(!uses_stable_row_ids(&table).await);
Ok(())
}
#[tokio::test]
async fn non_blob_table_keeps_default_format_and_row_id_setting() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
let table = db.create_empty_table("t", schema).execute().await?;
assert!(storage_format_version(&table).await < LanceFileVersion::V2_2);
assert!(!uses_stable_row_ids(&table).await);
Ok(())
}
#[tokio::test]
async fn creating_with_blob_data_bumps_format() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
// Batch already declares the blob field (pre-built struct).
let blob_field = blob("image", true);
let DataType::Struct(children) = blob_field.data_type().clone() else {
unreachable!("blob field is a struct")
};
let image = StructArray::new(
children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"payload".as_slice()])),
Arc::new(arrow_array::StringArray::from(vec![None::<&str>])),
],
None,
);
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
blob_field,
])),
vec![Arc::new(Int64Array::from(vec![1])), Arc::new(image)],
)
.unwrap();
let table = db.create_table("t", batch).execute().await?;
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
assert!(uses_stable_row_ids(&table).await);
assert_eq!(table.count_rows(None).await?, 1);
Ok(())
}
#[tokio::test]
async fn add_coerces_large_binary_into_blob_column() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table =
create_inline_blob_table(&db, "t", &[1, 2], &[Some(b"cat".as_slice()), Some(b"dog")])
.await?;
assert_eq!(table.count_rows(None).await?, 2);
let image = query_image_struct(&table).await;
assert_eq!(image.len(), 2);
// Table schema still has the blob marker after append.
let schema = table.schema().await?;
let field = schema.field_with_name("image").unwrap();
assert_eq!(
field
.metadata()
.get("ARROW:extension:name")
.map(String::as_str),
Some("lance.blob.v2")
);
Ok(())
}
#[tokio::test]
async fn add_coerces_binary_into_blob_column() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = db
.create_empty_table("t", blob_table_schema())
.execute()
.await?;
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("image", DataType::Binary, true),
])),
vec![
Arc::new(Int64Array::from(vec![1])),
Arc::new(BinaryArray::from_iter_values([b"small".as_slice()])),
],
)
.unwrap();
table.add(batch).execute().await?;
assert_eq!(table.count_rows(None).await?, 1);
Ok(())
}
#[tokio::test]
async fn add_accepts_null_blob_rows() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = create_inline_blob_table(
&db,
"t",
&[1, 2, 3],
&[Some(b"first".as_slice()), None, Some(b"third")],
)
.await?;
assert_eq!(table.count_rows(None).await?, 3);
let image = query_image_struct(&table).await;
assert_eq!(image.len(), 3);
Ok(())
}
#[tokio::test]
async fn add_rejects_uncoercible_blob_input() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = db
.create_empty_table("t", blob_table_schema())
.execute()
.await?;
let batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("image", DataType::Utf8, true),
])),
vec![
Arc::new(Int64Array::from(vec![1])),
Arc::new(arrow_array::StringArray::from(vec!["not bytes"])),
],
)
.unwrap();
let err = table.add(batch).execute().await.unwrap_err();
assert!(err.to_string().contains("image"), "got: {err}");
Ok(())
}
#[tokio::test]
async fn connection_level_stable_row_id_setting_wins_over_blob_default() -> Result<()> {
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap())
.storage_option(OPT_NEW_TABLE_ENABLE_STABLE_ROW_IDS, "false")
.execute()
.await?;
let table = db
.create_empty_table("t", blob_table_schema())
.execute()
.await?;
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
assert!(!uses_stable_row_ids(&table).await);
Ok(())
}
#[tokio::test]
async fn namespace_create_applies_blob_defaults() -> Result<()> {
let tmp = tempdir().unwrap();
let mut properties = std::collections::HashMap::new();
properties.insert("root".to_string(), tmp.path().to_str().unwrap().to_string());
let db = lancedb::connect_namespace("dir", properties)
.execute()
.await?;
let table = db
.create_empty_table("t", blob_table_schema())
.execute()
.await?;
assert!(storage_format_version(&table).await >= LanceFileVersion::V2_2);
assert!(uses_stable_row_ids(&table).await);
Ok(())
}
// Overwrite takes the input schema as-is (same as cast skip). Raw binary
// overwrite drops the blob marker unless the input declares blob v2.
#[tokio::test]
async fn overwrite_replaces_blob_schema_with_input_schema() -> Result<()> {
use lancedb::table::AddDataMode;
let tmp = tempdir().unwrap();
let db = connect(tmp.path().to_str().unwrap()).execute().await?;
let table = create_inline_blob_table(&db, "t", &[1], &[Some(b"blob".as_slice())]).await?;
// Raw binary overwrite. Plain LargeBinary replaces the blob declaration.
let raw_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
Field::new("image", DataType::LargeBinary, true),
]));
let raw_batch = RecordBatch::try_new(
raw_schema.clone(),
vec![
Arc::new(Int64Array::from(vec![2])),
Arc::new(LargeBinaryArray::from_iter_values([b"plain".as_slice()])),
],
)
.unwrap();
table
.add(raw_batch)
.mode(AddDataMode::Overwrite)
.execute()
.await?;
let schema = table.schema().await?;
assert_eq!(schema, raw_schema);
assert!(
!schema
.field_with_name("image")
.unwrap()
.metadata()
.contains_key("ARROW:extension:name"),
"raw binary overwrite leaves a plain binary column"
);
// Overwrite with a declared blob struct keeps the blob column.
let blob_field = blob("image", true);
let DataType::Struct(children) = blob_field.data_type().clone() else {
unreachable!("blob field is a struct")
};
let image = StructArray::new(
children,
vec![
Arc::new(LargeBinaryArray::from_iter_values([b"declared".as_slice()])),
Arc::new(arrow_array::StringArray::from(vec![None::<&str>])),
],
None,
);
let declared_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![
Field::new("id", DataType::Int64, false),
blob_field,
])),
vec![Arc::new(Int64Array::from(vec![3])), Arc::new(image)],
)
.unwrap();
table
.add(declared_batch)
.mode(AddDataMode::Overwrite)
.execute()
.await?;
let schema = table.schema().await?;
assert_eq!(
schema
.field_with_name("image")
.unwrap()
.metadata()
.get("ARROW:extension:name")
.map(String::as_str),
Some("lance.blob.v2")
);
Ok(())
}