feat(python): support blob modes in query to_pandas (#3487)

## Feature

- What is the new feature?
- Adds `blob_mode` support to sync and async Python query `to_pandas()`
APIs.
- Enables plain scan queries to return blob columns as lazy `BlobFile`
objects, raw bytes, or blob descriptions.
- Lets namespace-backed local tables use Lance native blob-aware pandas
conversion for lazy blobs.

- Why do we need this feature?
- Table and Lance dataset/scanner APIs already support blob-aware pandas
conversion, but LanceDB query builders did not expose that capability.
- Geneva and other callers should be able to use query-level
`to_pandas(blob_mode=...)` without manually constructing Lance scanners.

- How does it work?
- Plain scan queries route through Lance scanner native
`to_pandas(blob_mode=...)`, preserving filter, projection, limit,
offset, row id, and alias/expression projection behavior.
- Non-native query shapes keep existing Arrow fallback semantics and
raise a clear error when they return blob columns with
`blob_mode="lazy"` or `blob_mode="bytes"`.
- Focused tests cover table/query blob modes,
filter/select/limit/offset/alias query cases, async query behavior,
vector-query error boundaries, and namespace-backed lazy blobs.

## Validation

- `cd python && .venv/bin/maturin develop --uv --extras tests,dev
--profile dev`
- `cd python && uv run --frozen --no-sync pytest
python/tests/test_table.py::test_table_to_pandas_blob_modes
python/tests/test_table.py::test_async_table_to_pandas_blob_bytes
python/tests/test_query.py::test_plain_scan_query_to_pandas_blob_modes
python/tests/test_query.py::test_plain_scan_query_to_pandas_blob_projection
python/tests/test_query.py::test_async_plain_scan_query_to_pandas_blob_projection
python/tests/test_query.py::test_vector_query_to_pandas_blob_mode_requires_native_path
python/tests/test_namespace.py::TestNamespaceConnection::test_table_to_pandas_blob_lazy_through_namespace
-q`
- `cd python && uv run --frozen --no-sync ruff format --check .`
- `cd python && uv run --frozen --no-sync ruff check .`
- `git diff --check`
This commit is contained in:
Yang Cen
2026-06-03 19:15:44 +08:00
committed by GitHub
parent 379684391e
commit 6f18eb4cce
5 changed files with 576 additions and 68 deletions

View File

@@ -26,6 +26,28 @@ from lancedb.table import LanceTable
from pydantic import BaseModel
def _blob_test_data():
return pa.table(
{
"id": pa.array([1, 2], pa.int64()),
"blob": pa.array([b"hello", b"world"], pa.large_binary()),
},
schema=pa.schema(
[
pa.field("id", pa.int64()),
pa.field(
"blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
),
]
),
)
def _assert_lazy_blob(value, expected: bytes):
assert hasattr(value, "readall")
assert value.readall() == expected
def test_basic(mem_db: DBConnection):
data = [
{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
@@ -57,27 +79,30 @@ def test_table_to_pandas_default_matches_arrow(tmp_db: DBConnection):
pd.testing.assert_frame_equal(table.to_pandas(), expected)
def test_table_to_pandas_blob_bytes(tmp_db: DBConnection):
def test_table_to_pandas_invalid_blob_mode_non_blob_table(tmp_db: DBConnection):
data = pa.table({"id": [1, 2], "text": ["one", "two"]})
table = tmp_db.create_table("test_to_pandas_invalid_blob_mode", data=data)
with pytest.raises(ValueError, match="blob_mode must be one of"):
table.to_pandas(blob_mode="invalid")
@pytest.mark.parametrize("blob_mode", ["lazy", "bytes", "descriptions"])
def test_table_to_pandas_blob_modes(tmp_db: DBConnection, blob_mode):
pytest.importorskip("lance")
data = pa.table(
{
"id": pa.array([1, 2], pa.int64()),
"blob": pa.array([b"hello", b"world"], pa.large_binary()),
},
schema=pa.schema(
[
pa.field("id", pa.int64()),
pa.field(
"blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
),
]
),
)
table = tmp_db.create_table("test_to_pandas_blob_bytes", data=data)
table = tmp_db.create_table(f"test_to_pandas_blob_{blob_mode}", _blob_test_data())
df = table.to_pandas(blob_mode="bytes")
df = table.to_pandas(blob_mode=blob_mode)
assert df["blob"].tolist() == [b"hello", b"world"]
if blob_mode == "lazy":
_assert_lazy_blob(df["blob"].iloc[0], b"hello")
_assert_lazy_blob(df["blob"].iloc[1], b"world")
elif blob_mode == "bytes":
assert df["blob"].tolist() == [b"hello", b"world"]
else:
first = df["blob"].iloc[0]
assert first != b"hello"
assert not hasattr(first, "readall")
def test_table_to_pandas_kwargs(tmp_db: DBConnection):
@@ -93,22 +118,8 @@ def test_table_to_pandas_kwargs(tmp_db: DBConnection):
@pytest.mark.asyncio
async def test_async_table_to_pandas_blob_bytes(tmp_db_async: AsyncConnection):
pytest.importorskip("lance")
data = pa.table(
{
"id": pa.array([1, 2], pa.int64()),
"blob": pa.array([b"hello", b"world"], pa.large_binary()),
},
schema=pa.schema(
[
pa.field("id", pa.int64()),
pa.field(
"blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}
),
]
),
)
table = await tmp_db_async.create_table(
"test_async_to_pandas_blob_bytes", data=data
"test_async_to_pandas_blob_bytes", data=_blob_test_data()
)
df = await table.to_pandas(blob_mode="bytes")
@@ -116,6 +127,19 @@ async def test_async_table_to_pandas_blob_bytes(tmp_db_async: AsyncConnection):
assert df["blob"].tolist() == [b"hello", b"world"]
@pytest.mark.asyncio
async def test_async_table_to_pandas_invalid_blob_mode_non_blob_table(
tmp_db_async: AsyncConnection,
):
table = await tmp_db_async.create_table(
"test_async_to_pandas_invalid_blob_mode",
data=pa.table({"id": [1, 2], "text": ["one", "two"]}),
)
with pytest.raises(ValueError, match="blob_mode must be one of"):
await table.to_pandas(blob_mode="invalid")
@pytest.mark.asyncio
async def test_async_table_to_pandas_kwargs(tmp_db_async: AsyncConnection):
pd = pytest.importorskip("pandas")