mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-16 11:30:41 +00:00
Closes #3261.

## Summary

Adds `bytes` to the accepted types of `lancedb.expr.lit()` so that binary scalars can be used in filter / projection expressions.

The previous attempt in #3235 had to be reverted because DataFusion's SQL unparser does not support `Binary` / `LargeBinary` scalars, so any expression containing such a literal would fail in both `to_sql()` and `__repr__`.

## How

`expr_to_sql_string` now has two paths:

- **Fast path** (no binary literals): delegate to DataFusion's unparser unchanged.
- **Slow path**: rewrite each `Binary(Some(bytes))` literal in the tree to a unique string-literal placeholder, run the unparser, then substitute `'<placeholder>'` with `X'<HEX>'` in the resulting SQL. `Binary(None)` / `LargeBinary(None)` are rewritten to `ScalarValue::Null` so the unparser emits plain `NULL`.

This keeps DataFusion as the single source of truth for operator and function serialization, so binary literals work in every expression node type the unparser already supports — including nested cases like `contains(col("data"), lit(b"\xff"))`, `NOT (col == lit(b"..."))`, and `col.cast(...) == lit(b"...")`.

## Changes

- `rust/lancedb/src/expr/sql.rs`: placeholder-substitution implementation.
- `rust/lancedb/src/expr.rs`: 4 new unit tests covering binary literals in equality, compound predicates, scalar function calls, negation, and `NULL` binary literals.
- `python/src/expr.rs`: `expr_lit` accepts `PyBytes` and produces `ScalarValue::Binary`.
- `python/Cargo.toml` + `Cargo.lock`: pull in `datafusion-common` for `ScalarValue`.
- `python/python/lancedb/expr.py`: extend `ExprLike` and `lit()` type annotations / docstrings with `bytes`.
- `python/python/lancedb/_lancedb.pyi`: update `expr_lit` stub.
- `python/tests/test_expr.py`: unit tests for `to_sql` / `repr` of binary literals and an integration test against a real `pa.binary()` column for equality / inequality / compound filters.
## Example

```python
from lancedb.expr import col, lit, func

# Equality against a binary column
col("payload") == lit(b"\xca\xfe")
# Expr((payload = X'CAFE'))

# Nested inside a function call (previously failed)
func("contains", col("data"), lit(b"\xff"))
# Expr(contains(data, X'FF'))

# repr() no longer crashes
repr(lit(b"\xde\xad\xbe\xef"))
# "Expr(X'DEADBEEF')"
```

## Verification

- [x] `cargo test -p lancedb --lib expr::` — 12/12 pass (was 9; +3 new tests)
- [x] `cargo check --features remote --tests --examples` — clean
- [x] `cargo clippy --features remote --tests --examples` — no warnings
- [x] `cargo fmt --all -- --check` — clean
- [x] `pytest python/tests/test_expr.py` — 76/76 pass (was 74; +2 new tests)
- [x] `ruff check python` / `ruff format --check python` — clean

## Follow-ups (not in this PR)

Issue #3261 also raises the possibility of a *truncated* `__repr__` for very large binary literals. This PR keeps `__repr__` exact (it forwards to `to_sql()`), since truncating display output would diverge from the SQL that actually gets executed. A display-only truncation could be added in a follow-up by giving `__repr__` its own renderer.

Made with [Cursor](https://cursor.com)

Co-authored-by: Cursor <cursoragent@cursor.com>
52 lines
1.3 KiB
TOML
52 lines
1.3 KiB
TOML
# Crate metadata for the LanceDB Python bindings.
# Keys written as `<key>.workspace = true` are inherited from the workspace
# root manifest rather than being set here.
[package]
name = "lancedb-python"
version = "0.31.0-beta.11"
categories.workspace = true
edition.workspace = true
keywords.workspace = true
license.workspace = true
# Prevents this crate from being published to a Cargo registry.
publish = false
repository.workspace = true
rust-version = "1.91.0"
description = "Python bindings for LanceDB"

# Library target: built as a C-compatible dynamic library (`cdylib`) named
# `_lancedb`, matching the `_lancedb.pyi` stub on the Python side.
[lib]
name = "_lancedb"
crate-type = ["cdylib"]

# Runtime dependencies, sorted alphabetically.
# Entries written as `<name>.workspace = true` take their version and features
# from the workspace root manifest.
[dependencies]
arrow = { version = "58.0.0", features = ["pyarrow"] }
async-trait = "0.1"
bytes = "1"
# Provides `ScalarValue`, used to build binary literals in `expr_lit`.
datafusion-common.workspace = true
env_logger.workspace = true
futures.workspace = true
lance-core.workspace = true
lance-io.workspace = true
lance-namespace.workspace = true
lance-namespace-impls.workspace = true
# Default features are disabled here; the features to enable are selected
# explicitly through this crate's own [features] table.
lancedb = { path = "../rust/lancedb", default-features = false }
libc = "0.2"
log.workspace = true
pin-project = "1.1.5"
pyo3 = { version = "0.28", features = ["extension-module", "abi3-py39"] }
pyo3-async-runtimes = { version = "0.28", features = ["attributes", "tokio-runtime"] }
serde = "1"
serde_json = "1"
snafu.workspace = true
tokio = { version = "1.40", features = ["sync", "rt-multi-thread"] }

# Build-script dependencies. The version and features mirror those of the
# `pyo3` entry in [dependencies].
[build-dependencies]
pyo3-build-config = { version = "0.28", features = ["extension-module", "abi3-py39"] }

# Cargo features of this crate. Each one forwards to a feature of the
# `lancedb` dependency; `default` additionally enables this crate's own
# `remote` feature.
[features]
default = [
    "remote",
    "lancedb/aws",
    "lancedb/gcs",
    "lancedb/azure",
    "lancedb/dynamodb",
    "lancedb/oss",
    "lancedb/huggingface",
]
fp16kernels = ["lancedb/fp16kernels"]
remote = ["lancedb/remote"]