feat(python): add type-safe expression builder API (#3150)

Introduces col(), lit(), func(), and Expr class as alternatives to raw
SQL strings in .where() and .select(). Expressions are backed by
DataFusion's Expr AST and serialized to SQL for remote table compat.

Resolves: 
- https://github.com/lancedb/lancedb/issues/3044 (python api's)
- https://github.com/lancedb/lancedb/issues/3043 (support for filter)
- https://github.com/lancedb/lancedb/issues/3045 (support for
projection)

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Dey
2026-04-01 00:02:49 +05:30
committed by GitHub
parent c7f189f27b
commit 7b1c063848
13 changed files with 1059 additions and 41 deletions

View File

@@ -18,6 +18,7 @@ from .db import AsyncConnection, DBConnection, LanceDBConnection
from .io import StorageOptionsProvider
from .remote import ClientConfig
from .remote.db import RemoteDBConnection
from .expr import Expr, col, lit, func
from .schema import vector
from .table import AsyncTable, Table
from ._lancedb import Session
@@ -271,6 +272,10 @@ __all__ = [
"AsyncConnection",
"AsyncLanceNamespaceDBConnection",
"AsyncTable",
"col",
"Expr",
"func",
"lit",
"URI",
"sanitize_uri",
"vector",

View File

@@ -27,6 +27,32 @@ from .remote import ClientConfig
IvfHnswPq: type[HnswPq] = HnswPq
IvfHnswSq: type[HnswSq] = HnswSq
class PyExpr:
"""A type-safe DataFusion expression node (Rust-side handle)."""
def eq(self, other: "PyExpr") -> "PyExpr": ...
def ne(self, other: "PyExpr") -> "PyExpr": ...
def lt(self, other: "PyExpr") -> "PyExpr": ...
def lte(self, other: "PyExpr") -> "PyExpr": ...
def gt(self, other: "PyExpr") -> "PyExpr": ...
def gte(self, other: "PyExpr") -> "PyExpr": ...
def and_(self, other: "PyExpr") -> "PyExpr": ...
def or_(self, other: "PyExpr") -> "PyExpr": ...
def not_(self) -> "PyExpr": ...
def add(self, other: "PyExpr") -> "PyExpr": ...
def sub(self, other: "PyExpr") -> "PyExpr": ...
def mul(self, other: "PyExpr") -> "PyExpr": ...
def div(self, other: "PyExpr") -> "PyExpr": ...
def lower(self) -> "PyExpr": ...
def upper(self) -> "PyExpr": ...
def contains(self, substr: "PyExpr") -> "PyExpr": ...
def cast(self, data_type: pa.DataType) -> "PyExpr": ...
def to_sql(self) -> str: ...
def expr_col(name: str) -> PyExpr: ...
def expr_lit(value: Union[bool, int, float, str]) -> PyExpr: ...
def expr_func(name: str, args: List[PyExpr]) -> PyExpr: ...
class Session:
def __init__(
self,
@@ -225,7 +251,9 @@ class RecordBatchStream:
class Query:
def where(self, filter: str): ...
def select(self, columns: Tuple[str, str]): ...
def where_expr(self, expr: PyExpr): ...
def select(self, columns: List[Tuple[str, str]]): ...
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
def select_columns(self, columns: List[str]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
@@ -251,7 +279,9 @@ class TakeQuery:
class FTSQuery:
def where(self, filter: str): ...
def select(self, columns: List[str]): ...
def where_expr(self, expr: PyExpr): ...
def select(self, columns: List[Tuple[str, str]]): ...
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
def fast_search(self): ...
@@ -270,7 +300,9 @@ class VectorQuery:
async def output_schema(self) -> pa.Schema: ...
async def execute(self) -> RecordBatchStream: ...
def where(self, filter: str): ...
def select(self, columns: List[str]): ...
def where_expr(self, expr: PyExpr): ...
def select(self, columns: List[Tuple[str, str]]): ...
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
def select_with_projection(self, columns: Tuple[str, str]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
@@ -287,7 +319,9 @@ class VectorQuery:
class HybridQuery:
def where(self, filter: str): ...
def select(self, columns: List[str]): ...
def where_expr(self, expr: PyExpr): ...
def select(self, columns: List[Tuple[str, str]]): ...
def select_expr(self, columns: List[Tuple[str, PyExpr]]): ...
def limit(self, limit: int): ...
def offset(self, offset: int): ...
def fast_search(self): ...

View File

@@ -0,0 +1,298 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
"""Type-safe expression builder for filters and projections.
Instead of writing raw SQL strings you can build expressions with Python
operators::
from lancedb.expr import col, lit
# filter: age > 18 AND status = 'active'
filt = (col("age") > lit(18)) & (col("status") == lit("active"))
# projection: compute a derived column
proj = {"score": col("raw_score") * lit(1.5)}
table.search().where(filt).select(proj).to_list()
"""
from __future__ import annotations
from typing import Union
import pyarrow as pa
from lancedb._lancedb import PyExpr, expr_col, expr_lit, expr_func
__all__ = ["Expr", "col", "lit", "func"]
_STR_TO_PA_TYPE: dict = {
"bool": pa.bool_(),
"boolean": pa.bool_(),
"int8": pa.int8(),
"int16": pa.int16(),
"int32": pa.int32(),
"int64": pa.int64(),
"uint8": pa.uint8(),
"uint16": pa.uint16(),
"uint32": pa.uint32(),
"uint64": pa.uint64(),
"float16": pa.float16(),
"float32": pa.float32(),
"float": pa.float32(),
"float64": pa.float64(),
"double": pa.float64(),
"string": pa.string(),
"utf8": pa.string(),
"str": pa.string(),
"large_string": pa.large_utf8(),
"large_utf8": pa.large_utf8(),
"date32": pa.date32(),
"date": pa.date32(),
"date64": pa.date64(),
}
def _coerce(value: "ExprLike") -> "Expr":
"""Return *value* as an :class:`Expr`, wrapping plain Python values via
:func:`lit` if needed."""
if isinstance(value, Expr):
return value
return lit(value)
# Type alias used in annotations.
ExprLike = Union["Expr", bool, int, float, str]
class Expr:
"""A type-safe expression node.
Construct instances with :func:`col` and :func:`lit`, then combine them
using Python operators or the named methods below.
Examples
--------
>>> from lancedb.expr import col, lit
>>> filt = (col("age") > lit(18)) & (col("name").lower() == lit("alice"))
>>> proj = {"double": col("x") * lit(2)}
"""
# Make Expr unhashable so that == returns an Expr rather than being used
# for dict keys / set membership.
__hash__ = None # type: ignore[assignment]
def __init__(self, inner: PyExpr) -> None:
self._inner = inner
# ── comparisons ──────────────────────────────────────────────────────────
def __eq__(self, other: ExprLike) -> "Expr": # type: ignore[override]
"""Equal to (``col("x") == 1``)."""
return Expr(self._inner.eq(_coerce(other)._inner))
def __ne__(self, other: ExprLike) -> "Expr": # type: ignore[override]
"""Not equal to (``col("x") != 1``)."""
return Expr(self._inner.ne(_coerce(other)._inner))
def __lt__(self, other: ExprLike) -> "Expr":
"""Less than (``col("x") < 1``)."""
return Expr(self._inner.lt(_coerce(other)._inner))
def __le__(self, other: ExprLike) -> "Expr":
"""Less than or equal to (``col("x") <= 1``)."""
return Expr(self._inner.lte(_coerce(other)._inner))
def __gt__(self, other: ExprLike) -> "Expr":
"""Greater than (``col("x") > 1``)."""
return Expr(self._inner.gt(_coerce(other)._inner))
def __ge__(self, other: ExprLike) -> "Expr":
"""Greater than or equal to (``col("x") >= 1``)."""
return Expr(self._inner.gte(_coerce(other)._inner))
# ── logical ──────────────────────────────────────────────────────────────
def __and__(self, other: "Expr") -> "Expr":
"""Logical AND (``expr_a & expr_b``)."""
return Expr(self._inner.and_(_coerce(other)._inner))
def __or__(self, other: "Expr") -> "Expr":
"""Logical OR (``expr_a | expr_b``)."""
return Expr(self._inner.or_(_coerce(other)._inner))
def __invert__(self) -> "Expr":
"""Logical NOT (``~expr``)."""
return Expr(self._inner.not_())
# ── arithmetic ───────────────────────────────────────────────────────────
def __add__(self, other: ExprLike) -> "Expr":
"""Add (``col("x") + 1``)."""
return Expr(self._inner.add(_coerce(other)._inner))
def __radd__(self, other: ExprLike) -> "Expr":
"""Right-hand add (``1 + col("x")``)."""
return Expr(_coerce(other)._inner.add(self._inner))
def __sub__(self, other: ExprLike) -> "Expr":
"""Subtract (``col("x") - 1``)."""
return Expr(self._inner.sub(_coerce(other)._inner))
def __rsub__(self, other: ExprLike) -> "Expr":
"""Right-hand subtract (``1 - col("x")``)."""
return Expr(_coerce(other)._inner.sub(self._inner))
def __mul__(self, other: ExprLike) -> "Expr":
"""Multiply (``col("x") * 2``)."""
return Expr(self._inner.mul(_coerce(other)._inner))
def __rmul__(self, other: ExprLike) -> "Expr":
"""Right-hand multiply (``2 * col("x")``)."""
return Expr(_coerce(other)._inner.mul(self._inner))
def __truediv__(self, other: ExprLike) -> "Expr":
"""Divide (``col("x") / 2``)."""
return Expr(self._inner.div(_coerce(other)._inner))
def __rtruediv__(self, other: ExprLike) -> "Expr":
"""Right-hand divide (``1 / col("x")``)."""
return Expr(_coerce(other)._inner.div(self._inner))
# ── string methods ───────────────────────────────────────────────────────
def lower(self) -> "Expr":
"""Convert string column values to lowercase."""
return Expr(self._inner.lower())
def upper(self) -> "Expr":
"""Convert string column values to uppercase."""
return Expr(self._inner.upper())
def contains(self, substr: "ExprLike") -> "Expr":
"""Return True where the string contains *substr*."""
return Expr(self._inner.contains(_coerce(substr)._inner))
# ── type cast ────────────────────────────────────────────────────────────
def cast(self, data_type: Union[str, "pa.DataType"]) -> "Expr":
"""Cast values to *data_type*.
Parameters
----------
data_type:
A PyArrow ``DataType`` (e.g. ``pa.int32()``) or one of the type
name strings: ``"bool"``, ``"int8"``, ``"int16"``, ``"int32"``,
``"int64"``, ``"uint8"````"uint64"``, ``"float32"``,
``"float64"``, ``"string"``, ``"date32"``, ``"date64"``.
"""
if isinstance(data_type, str):
try:
data_type = _STR_TO_PA_TYPE[data_type]
except KeyError:
raise ValueError(
f"unsupported data type: '{data_type}'. Supported: "
f"{', '.join(_STR_TO_PA_TYPE)}"
)
return Expr(self._inner.cast(data_type))
# ── named comparison helpers (alternative to operators) ──────────────────
def eq(self, other: ExprLike) -> "Expr":
"""Equal to."""
return self.__eq__(other)
def ne(self, other: ExprLike) -> "Expr":
"""Not equal to."""
return self.__ne__(other)
def lt(self, other: ExprLike) -> "Expr":
"""Less than."""
return self.__lt__(other)
def lte(self, other: ExprLike) -> "Expr":
"""Less than or equal to."""
return self.__le__(other)
def gt(self, other: ExprLike) -> "Expr":
"""Greater than."""
return self.__gt__(other)
def gte(self, other: ExprLike) -> "Expr":
"""Greater than or equal to."""
return self.__ge__(other)
def and_(self, other: "Expr") -> "Expr":
"""Logical AND."""
return self.__and__(other)
def or_(self, other: "Expr") -> "Expr":
"""Logical OR."""
return self.__or__(other)
# ── utilities ────────────────────────────────────────────────────────────
def to_sql(self) -> str:
"""Render the expression as a SQL string (useful for debugging)."""
return self._inner.to_sql()
def __repr__(self) -> str:
return f"Expr({self._inner.to_sql()})"
# ── free functions ────────────────────────────────────────────────────────────
def col(name: str) -> Expr:
"""Reference a table column by name.
Parameters
----------
name:
The column name.
Examples
--------
>>> from lancedb.expr import col, lit
>>> col("age") > lit(18)
Expr((age > 18))
"""
return Expr(expr_col(name))
def lit(value: Union[bool, int, float, str]) -> Expr:
"""Create a literal (constant) value expression.
Parameters
----------
value:
A Python ``bool``, ``int``, ``float``, or ``str``.
Examples
--------
>>> from lancedb.expr import col, lit
>>> col("price") * lit(1.1)
Expr((price * 1.1))
"""
return Expr(expr_lit(value))
def func(name: str, *args: ExprLike) -> Expr:
"""Call an arbitrary SQL function by name.
Parameters
----------
name:
The SQL function name (e.g. ``"lower"``, ``"upper"``).
*args:
The function arguments as :class:`Expr` or plain Python literals.
Examples
--------
>>> from lancedb.expr import col, func
>>> func("lower", col("name"))
Expr(lower(name))
"""
inner_args = [_coerce(a)._inner for a in args]
return Expr(expr_func(name, inner_args))

View File

@@ -38,6 +38,7 @@ from .rerankers.base import Reranker
from .rerankers.rrf import RRFReranker
from .rerankers.util import check_reranker_result
from .util import flatten_columns
from .expr import Expr
from lancedb._lancedb import fts_query_to_json
from typing_extensions import Annotated
@@ -449,8 +450,8 @@ class Query(pydantic.BaseModel):
ensure_vector_query,
] = None
# sql filter to refine the query with
filter: Optional[str] = None
# sql filter or type-safe Expr to refine the query with
filter: Optional[Union[str, Expr]] = None
# if True then apply the filter after vector search
postfilter: Optional[bool] = None
@@ -464,8 +465,8 @@ class Query(pydantic.BaseModel):
# distance type to use for vector search
distance_type: Optional[str] = None
# which columns to return in the results
columns: Optional[Union[List[str], Dict[str, str]]] = None
# which columns to return in the results (dict values may be str or Expr)
columns: Optional[Union[List[str], Dict[str, Union[str, Expr]]]] = None
# minimum number of IVF partitions to search
#
@@ -856,14 +857,15 @@ class LanceQueryBuilder(ABC):
self._offset = offset
return self
def select(self, columns: Union[list[str], dict[str, str]]) -> Self:
def select(self, columns: Union[list[str], dict[str, Union[str, Expr]]]) -> Self:
"""Set the columns to return.
Parameters
----------
columns: list of str, or dict of str to str default None
columns: list of str, or dict of str to str or Expr
List of column names to be fetched.
Or a dictionary of column names to SQL expressions.
Or a dictionary of column names to SQL expressions or
:class:`~lancedb.expr.Expr` objects.
All columns are fetched if None or unspecified.
Returns
@@ -877,15 +879,15 @@ class LanceQueryBuilder(ABC):
raise ValueError("columns must be a list or a dictionary")
return self
def where(self, where: str, prefilter: bool = True) -> Self:
def where(self, where: Union[str, Expr], prefilter: bool = True) -> Self:
"""Set the where clause.
Parameters
----------
where: str
The where clause which is a valid SQL where clause. See
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
for valid SQL expressions.
where: str or :class:`~lancedb.expr.Expr`
The filter condition. Can be a SQL string or a type-safe
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
and :func:`~lancedb.expr.lit`.
prefilter: bool, default True
If True, apply the filter before vector search, otherwise the
filter is applied on the result of vector search.
@@ -1355,15 +1357,17 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
return result_set
def where(self, where: str, prefilter: bool = None) -> LanceVectorQueryBuilder:
def where(
self, where: Union[str, Expr], prefilter: bool = None
) -> LanceVectorQueryBuilder:
"""Set the where clause.
Parameters
----------
where: str
The where clause which is a valid SQL where clause. See
`Lance filter pushdown <https://lance.org/guide/read_and_write#filter-push-down>`_
for valid SQL expressions.
where: str or :class:`~lancedb.expr.Expr`
The filter condition. Can be a SQL string or a type-safe
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
and :func:`~lancedb.expr.lit`.
prefilter: bool, default True
If True, apply the filter before vector search, otherwise the
filter is applied on the result of vector search.
@@ -2286,10 +2290,20 @@ class AsyncQueryBase(object):
"""
if isinstance(columns, list) and all(isinstance(c, str) for c in columns):
self._inner.select_columns(columns)
elif isinstance(columns, dict) and all(
isinstance(k, str) and isinstance(v, str) for k, v in columns.items()
):
self._inner.select(list(columns.items()))
elif isinstance(columns, dict) and all(isinstance(k, str) for k in columns):
if any(isinstance(v, Expr) for v in columns.values()):
# At least one value is an Expr — use the type-safe path.
from .expr import _coerce
pairs = [(k, _coerce(v)._inner) for k, v in columns.items()]
self._inner.select_expr(pairs)
elif all(isinstance(v, str) for v in columns.values()):
self._inner.select(list(columns.items()))
else:
raise TypeError(
"dict values must be str or Expr, got "
+ str({k: type(v) for k, v in columns.items()})
)
else:
raise TypeError("columns must be a list of column names or a dict")
return self
@@ -2529,11 +2543,13 @@ class AsyncStandardQuery(AsyncQueryBase):
"""
super().__init__(inner)
def where(self, predicate: str) -> Self:
def where(self, predicate: Union[str, Expr]) -> Self:
"""
Only return rows matching the given predicate
The predicate should be supplied as an SQL query string.
The predicate can be a SQL string or a type-safe
:class:`~lancedb.expr.Expr` built with :func:`~lancedb.expr.col`
and :func:`~lancedb.expr.lit`.
Examples
--------
@@ -2545,7 +2561,10 @@ class AsyncStandardQuery(AsyncQueryBase):
Filtering performance can often be improved by creating a scalar index
on the filter column(s).
"""
self._inner.where(predicate)
if isinstance(predicate, Expr):
self._inner.where_expr(predicate._inner)
else:
self._inner.where(predicate)
return self
def limit(self, limit: int) -> Self:

View File

@@ -4211,7 +4211,7 @@ class AsyncTable:
async_query = async_query.offset(query.offset)
if query.columns:
async_query = async_query.select(query.columns)
if query.filter:
if query.filter is not None:
async_query = async_query.where(query.filter)
if query.fast_search:
async_query = async_query.fast_search()