Files
lancedb/python/src/expr.rs
Shengan Zhang 64aeee84a8 feat(python): support bytes in lit() expressions (#3387)
Closes #3261.

## Summary

Adds `bytes` to the accepted types of `lancedb.expr.lit()` so that
binary scalars can be used in filter / projection expressions. The
previous attempt in #3235 had to be reverted because DataFusion's SQL
unparser does not support `Binary` / `LargeBinary` scalars, so any
expression containing such a literal would fail in both `to_sql()` and
`__repr__`.

## How

`expr_to_sql_string` now has two paths:

- **Fast path** (no binary literals): delegate to DataFusion's unparser
unchanged.
- **Slow path**: rewrite each `Binary(Some(bytes))` literal in the tree
to a unique string-literal placeholder, run the unparser, then
substitute `'<placeholder>'` with `X'<HEX>'` in the resulting SQL.
`Binary(None)` / `LargeBinary(None)` are rewritten to
`ScalarValue::Null` so the unparser emits plain `NULL`.

This keeps DataFusion as the single source of truth for operator and
function serialization, so binary literals work in every expression node
type the unparser already supports — including nested cases like
`contains(col("data"), lit(b"\xff"))`, `NOT (col == lit(b"..."))`, and
`col.cast(...) == lit(b"...")`.

## Changes

- `rust/lancedb/src/expr/sql.rs`: placeholder-substitution
implementation.
- `rust/lancedb/src/expr.rs`: 4 new unit tests covering binary literals
in equality, compound predicates, scalar function calls, negation, and
`NULL` binary literals.
- `python/src/expr.rs`: `expr_lit` accepts `PyBytes` and produces
`ScalarValue::Binary`.
- `python/Cargo.toml` + `Cargo.lock`: pull in `datafusion-common` for
`ScalarValue`.
- `python/python/lancedb/expr.py`: extend `ExprLike` and `lit()` type
annotations / docstrings with `bytes`.
- `python/python/lancedb/_lancedb.pyi`: update `expr_lit` stub.
- `python/tests/test_expr.py`: unit tests for `to_sql` / `repr` of
binary literals and an integration test against a real `pa.binary()`
column for equality / inequality / compound filters.

## Example

```python
from lancedb.expr import col, lit, func

# Equality against a binary column
col("payload") == lit(b"\xca\xfe")
# Expr((payload = X'CAFE'))

# Nested inside a function call (previously failed)
func("contains", col("data"), lit(b"\xff"))
# Expr(contains(data, X'FF'))

# repr() no longer crashes
repr(lit(b"\xde\xad\xbe\xef"))
# "Expr(X'DEADBEEF')"
```

## Verification

- [x] `cargo test -p lancedb --lib expr::` — 12/12 pass (was 9; +3 new
tests)
- [x] `cargo check --features remote --tests --examples` — clean
- [x] `cargo clippy --features remote --tests --examples` — no warnings
- [x] `cargo fmt --all -- --check` — clean
- [x] `pytest python/tests/test_expr.py` — 76/76 pass (was 74; +2 new
tests)
- [x] `ruff check python` / `ruff format --check python` — clean

## Follow-ups (not in this PR)

Issue #3261 also raises the possibility of a *truncated* `__repr__` for
very large binary literals. This PR keeps `__repr__` exact (it forwards
to `to_sql()`), since truncating display output would diverge from the
SQL that actually gets executed. A display-only truncation could be
added in a follow-up by giving `__repr__` its own renderer.

Made with [Cursor](https://cursor.com)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-14 15:24:52 -07:00

182 lines
6.6 KiB
Rust

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
//! PyO3 bindings for the LanceDB expression builder API.
//!
//! This module exposes [`PyExpr`] and helper free functions so Python can
//! build type-safe filter / projection expressions that map directly to
//! DataFusion [`Expr`] nodes, bypassing SQL string parsing.
use arrow::{datatypes::DataType, pyarrow::PyArrowType};
use datafusion_common::ScalarValue;
use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper};
use pyo3::types::PyBytes;
use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction};
/// A type-safe DataFusion expression.
///
/// Instances are constructed via the free functions [`expr_col`] and
/// [`expr_lit`] and combined with the methods on this struct. On the Python
/// side a thin wrapper class (`lancedb.expr.Expr`) delegates to these methods
/// and adds Python operator overloads.
#[pyclass(name = "PyExpr", from_py_object)]
#[derive(Clone)]
pub struct PyExpr(pub DfExpr);
#[pymethods]
impl PyExpr {
// ── comparisons ──────────────────────────────────────────────────────────
fn eq(&self, other: &Self) -> Self {
Self(self.0.clone().eq(other.0.clone()))
}
fn ne(&self, other: &Self) -> Self {
Self(self.0.clone().not_eq(other.0.clone()))
}
fn lt(&self, other: &Self) -> Self {
Self(self.0.clone().lt(other.0.clone()))
}
fn lte(&self, other: &Self) -> Self {
Self(self.0.clone().lt_eq(other.0.clone()))
}
fn gt(&self, other: &Self) -> Self {
Self(self.0.clone().gt(other.0.clone()))
}
fn gte(&self, other: &Self) -> Self {
Self(self.0.clone().gt_eq(other.0.clone()))
}
// ── logical ──────────────────────────────────────────────────────────────
fn and_(&self, other: &Self) -> Self {
Self(self.0.clone().and(other.0.clone()))
}
fn or_(&self, other: &Self) -> Self {
Self(self.0.clone().or(other.0.clone()))
}
fn not_(&self) -> Self {
use std::ops::Not;
Self(self.0.clone().not())
}
// ── arithmetic ───────────────────────────────────────────────────────────
fn add(&self, other: &Self) -> Self {
use std::ops::Add;
Self(self.0.clone().add(other.0.clone()))
}
fn sub(&self, other: &Self) -> Self {
use std::ops::Sub;
Self(self.0.clone().sub(other.0.clone()))
}
fn mul(&self, other: &Self) -> Self {
use std::ops::Mul;
Self(self.0.clone().mul(other.0.clone()))
}
fn div(&self, other: &Self) -> Self {
use std::ops::Div;
Self(self.0.clone().div(other.0.clone()))
}
// ── string functions ─────────────────────────────────────────────────────
/// Convert string column to lowercase.
fn lower(&self) -> Self {
Self(lower(self.0.clone()))
}
/// Convert string column to uppercase.
fn upper(&self) -> Self {
Self(upper(self.0.clone()))
}
/// Test whether the string contains `substr`.
fn contains(&self, substr: &Self) -> Self {
Self(contains(self.0.clone(), substr.0.clone()))
}
// ── type cast ────────────────────────────────────────────────────────────
/// Cast the expression to `data_type`.
///
/// `data_type` must be a PyArrow `DataType` (e.g. `pa.int32()`).
/// On the Python side, `lancedb.expr.Expr.cast` also accepts type name
/// strings via `pa.lib.ensure_type` before forwarding here.
fn cast(&self, data_type: PyArrowType<DataType>) -> Self {
Self(expr_cast(self.0.clone(), data_type.0))
}
// ── utilities ────────────────────────────────────────────────────────────
/// Render the expression as a SQL string (useful for debugging).
fn to_sql(&self) -> PyResult<String> {
lancedb::expr::expr_to_sql_string(&self.0).map_err(|e| PyValueError::new_err(e.to_string()))
}
fn __repr__(&self) -> PyResult<String> {
let sql =
lancedb::expr::expr_to_sql_string(&self.0).unwrap_or_else(|_| "<expr>".to_string());
Ok(format!("PyExpr({})", sql))
}
}
// ── free functions ────────────────────────────────────────────────────────────
/// Create a column reference expression.
///
/// The column name is preserved exactly as given (case-sensitive), so
/// `col("firstName")` correctly references a field named `firstName`.
#[pyfunction]
pub fn expr_col(name: &str) -> PyExpr {
PyExpr(ldb_col(name))
}
/// Create a literal value expression.
///
/// Supported Python types: `bool`, `int`, `float`, `str`, `bytes`.
#[pyfunction]
pub fn expr_lit(value: Bound<'_, PyAny>) -> PyResult<PyExpr> {
// bool must be checked before int because bool is a subclass of int in Python
if let Ok(b) = value.extract::<bool>() {
return Ok(PyExpr(df_lit(b)));
}
if let Ok(i) = value.extract::<i64>() {
return Ok(PyExpr(df_lit(i)));
}
if let Ok(f) = value.extract::<f64>() {
return Ok(PyExpr(df_lit(f)));
}
if let Ok(s) = value.extract::<String>() {
return Ok(PyExpr(df_lit(s)));
}
if value.is_instance_of::<PyBytes>() {
let bytes = value.extract::<Vec<u8>>()?;
return Ok(PyExpr(df_lit(ScalarValue::Binary(Some(bytes)))));
}
Err(PyValueError::new_err(format!(
"unsupported literal type: {}. Supported: bool, int, float, str, bytes",
value.get_type().name()?
)))
}
/// Call an arbitrary registered SQL function by name.
///
/// See `lancedb::expr::func` for the list of supported function names.
#[pyfunction]
pub fn expr_func(name: &str, args: Vec<PyExpr>) -> PyResult<PyExpr> {
let df_args: Vec<DfExpr> = args.into_iter().map(|e| e.0).collect();
lancedb::expr::func(name, df_args)
.map(PyExpr)
.map_err(|e| PyValueError::new_err(e.to_string()))
}