From 85d9c1ce63aaa1e4210db8e93c2c2debb316e5c0 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 10 Jun 2026 15:28:19 -0700 Subject: [PATCH] feat: adds isin support to the 'Expr' builder (#3523) The `Expr` builder already includes a lot of useful filtering options, `eq, ne, gt/gte, lt/lte, and_, or_, contains, cast`, but it was missing a membership test like `isin`. This PR adds that support, as minimally as possible, allowing easy filtering for membership in a list, without needing a series of `where` expressions. I didn't see anything in CONTRIBUTING.md about needing a feature request or issue first, so I just made the change. My apologies if I missed that somewhere. Thanks for the vector store, we're using it now in paperless-ngx. --- .gitignore | 1 + python/python/lancedb/_lancedb.pyi | 1 + python/python/lancedb/expr.py | 7 ++++++- python/src/expr.rs | 12 +++++++++++- python/tests/test_expr.py | 21 +++++++++++++++++++++ rust/lancedb/src/expr.rs | 11 +++++++++++ 6 files changed, 51 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a6c97002a..0679f3c8a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ python/dist *.so *.dylib *.dll +*.pdb ## Javascript *.node diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index 5683a9c9b..d8b92d20e 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -48,6 +48,7 @@ class PyExpr: def lower(self) -> "PyExpr": ... def upper(self) -> "PyExpr": ... def contains(self, substr: "PyExpr") -> "PyExpr": ... + def isin(self, values: List["PyExpr"]) -> "PyExpr": ... def cast(self, data_type: pa.DataType) -> "PyExpr": ... def to_sql(self) -> str: ... 
diff --git a/python/python/lancedb/expr.py b/python/python/lancedb/expr.py index 08cb91c50..c28bc208a 100644 --- a/python/python/lancedb/expr.py +++ b/python/python/lancedb/expr.py @@ -19,7 +19,7 @@ operators:: from __future__ import annotations -from typing import Union +from typing import Iterable, Union import pyarrow as pa @@ -174,6 +174,11 @@ class Expr: """Return True where the string contains *substr*.""" return Expr(self._inner.contains(_coerce(substr)._inner)) + def isin(self, values: "Iterable[ExprLike]") -> "Expr": + """Return True where the value is one of *values* (SQL ``IN``).""" + inner = [_coerce(v)._inner for v in values] + return Expr(self._inner.isin(inner)) + # ── type cast ──────────────────────────────────────────────────────────── def cast(self, data_type: Union[str, "pa.DataType"]) -> "Expr": diff --git a/python/src/expr.rs b/python/src/expr.rs index b322c5bdf..d3e9ea5aa 100644 --- a/python/src/expr.rs +++ b/python/src/expr.rs @@ -9,7 +9,9 @@ use arrow::{datatypes::DataType, pyarrow::PyArrowType}; use datafusion_common::ScalarValue; -use lancedb::expr::{DfExpr, col as ldb_col, contains, expr_cast, lit as df_lit, lower, upper}; +use lancedb::expr::{ + DfExpr, col as ldb_col, contains, expr_cast, is_in, lit as df_lit, lower, upper, +}; use pyo3::types::PyBytes; use pyo3::{Bound, PyAny, PyResult, exceptions::PyValueError, prelude::*, pyfunction}; @@ -105,6 +107,14 @@ impl PyExpr { Self(contains(self.0.clone(), substr.0.clone())) } + // ── membership ─────────────────────────────────────────────────────────── + + /// Return true where the value is one of the given expressions (SQL ``IN``). + fn isin(&self, list: Vec<PyExpr>) -> Self { + let items: Vec<DfExpr> = list.into_iter().map(|e| e.0).collect(); + Self(is_in(self.0.clone(), items)) + } + // ── type cast ──────────────────────────────────────────────────────────── /// Cast the expression to `data_type`. 
diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index c4f68b1e2..7858995e7 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -450,6 +450,27 @@ def binary_table(tmp_path): return db.create_table("binary_test", data) +class TestExprIsin: + def test_isin_ints(self): + assert col("id").isin([1, 2, 3]).to_sql() == "id IN (1, 2, 3)" + + def test_isin_strs(self): + assert ( + col("status").isin(["active", "pending"]).to_sql() + == "status IN ('active', 'pending')" + ) + + def test_isin_coerces_and_mixes(self): + assert col("id").isin([lit(1), 2]).to_sql() == "id IN (1, 2)" + + def test_isin_empty(self): + assert col("id").isin([]).to_sql() == "id IN ()" + + def test_isin_filter(self, simple_table): + result = simple_table.search().where(col("id").isin([1, 3, 5])).to_arrow() + assert result.num_rows == 3 + + class TestExprBytesIntegration: def test_binary_equality_filter(self, binary_table): result = ( diff --git a/rust/lancedb/src/expr.rs b/rust/lancedb/src/expr.rs index 02b6b7d08..de245d796 100644 --- a/rust/lancedb/src/expr.rs +++ b/rust/lancedb/src/expr.rs @@ -57,6 +57,10 @@ pub fn expr_cast(expr: Expr, data_type: DataType) -> Expr { cast(expr, data_type) } +pub fn is_in(expr: Expr, list: Vec<Expr>) -> Expr { + expr.in_list(list, false) +} + lazy_static::lazy_static! { static ref FUNC_REGISTRY: std::sync::RwLock>> = { let mut m = std::collections::HashMap::new(); @@ -194,6 +198,13 @@ mod tests { assert_eq!(sql, "NOT (data = X'ABCD')"); } + #[test] + fn test_is_in() { + let expr = is_in(col("id"), vec![lit(1i64), lit(2i64), lit(3i64)]); + let sql = expr_to_sql_string(&expr).unwrap(); + assert!(sql.contains("IN"), "expected IN in: {}", sql); + } + #[test] fn test_multiple_binary_literals() { use datafusion_common::ScalarValue;